diff --git a/modules/bibformat/doc/hacking/bibformat-api.webdoc b/modules/bibformat/doc/hacking/bibformat-api.webdoc
index e703eec65..dbc9f41ec 100644
--- a/modules/bibformat/doc/hacking/bibformat-api.webdoc
+++ b/modules/bibformat/doc/hacking/bibformat-api.webdoc
@@ -1,866 +1,885 @@
## -*- mode: html; coding: utf-8; -*-
## $Id$
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
****************************************************************************
** IMPORTANT NOTE: Note that this documentation is an updated version of **
** an earlier technical draft of BibFormat specifications. Please first **
** refer to the BibFormat admin guide. **
****************************************************************************
Technical Overview of the new BibFormat
=======================================
Contents:
1. Python API
2. The philosophy behind BibFormat
3. Differences between the old PHP version and the new Pythonic version
4. Migrating from the previous PHP BibFormat version to the new Pythonic version
5. Specifications of the new BibFormat configuration files.
1. Python API
The API of bibformat.py consists of these functions:
def format_record(recID, of, ln=cdslang, verbose=0,
- search_pattern=None, xml_record=None, uid=None,
+ search_pattern=None, xml_record=None, user_info=None,
on_the_fly=False):
"""
Formats a record given its ID (or its XML representation)
and an output format.
Returns a formatted version of the record in the specified
language, with pattern context, and specified output format.
The function will define by itself which format template must be
applied.
Parameters that allow contextual formatting (like 'search_pattern'
- and 'uid') are useful only when doing on-the-fly formatting,
- or when caching with care (e.g. caching all formatted
+ and 'user_info') are useful only when doing on-the-fly
+ formatting, or when caching with care (e.g. caching all formatted
versions of a record for each possible 'ln').
The arguments are as follows:
recID - the ID of the record to format. If ID does not exist
the function returns empty string or an error
string, depending on level of verbosity.
If 'xml_record' parameter is specified, 'recID'
is ignored
of - an output format code. If 'of' does not exist as code in
output format, the function returns empty
string or an error string, depending on level
of verbosity. 'of' is case insensitive.
ln - the language to use to format the record. If
'ln' is an unknown language, or translation
does not exist, default cdslang language
will be applied whenever possible.
Allows contextual formatting.
verbose - the level of verbosity in case of errors/warnings
0 - Silent mode
5 - Prints only errors
9 - Prints errors and warnings
search_pattern - the pattern used as search query when asked to
format this record (User request in web
interface). Allows contextual formatting.
xml_record - an XML string representation of the record to
format. If it is specified, recID parameter is
ignored. The XML must be parsable by BibRecord.
- uid - User ID of the user who will view the formatted
- record. Useful to grant access to special
- functions on a page depending on user's
- priviledge. Allows contextual formatting.
- Typically 'uid' is retrieved with webuser.getUid(req).
+ user_info - allows to grant access to some functionalities
+ on a page depending on the user's
+ privileges. 'user_info' is the same structure
+ as the one returned by webuser.collect_user_info(req),
+ (that is a dictionary).
- on_the_fly - if False, try to return an already preformatted version
- of the record in the database.
+ on_the_fly - if False, try to return an already preformatted
+ version of the record in the database.
"""
Example:
>> from invenio.bibformat import format_record
>> format_record(5, "hb", "fr")
def format_records(recIDs, of, ln=cdslang, verbose=0, search_pattern=None,
- xml_records=None, uid=None, record_prefix=None,
+ xml_records=None, user_info=None, record_prefix=None,
record_separator=None, record_suffix=None,
prologue="", epilogue="", req=None, on_the_fly=False):
"""
Returns a list of formatted records given by a list of record IDs or a
list of records as xml.
Adds a prefix before each record, a suffix after each record,
plus a separator between records.
Also adds an optional prologue and epilogue to the complete formatted list.
You can either specify a list of record IDs to format, or a list of
xml records, but not both (if both are specified recIDs is ignored).
'record_separator' is a function that returns a string as separator between
records. The function must take an integer as unique parameter,
which is the index in recIDs (or xml_records) of the record that has
just been formatted. For example separator(i) must return the separator
between recID[i] and recID[i+1]. Alternatively separator can be a single
string, which will be used to separate all formatted records.
The same applies to 'record_prefix' and 'record_suffix'.
'req' is an optional parameter on which the result of the function
is printed live (record after record) if it is given.
Note that you should set 'req' content-type by yourself, and send
http header before calling this function as it will not do it.
This function takes the same parameters as 'format_record' except for:
recIDs - a list of record IDs to format
xml_records - a list of xml string representations of the records to
format. If this list is specified, 'recIDs' is ignored.
record_prefix - a string or a function that takes the index of the record
in 'recIDs' or 'xml_records' for which the function must
return a string.
Printed before each formatted record.
record_separator - either a string or a function that returns string to
separate formatted records. The function takes the index
of the record in 'recIDs' or 'xml_records' that is being
formatted.
record_suffix - a string or a function that takes the index of the record
in 'recIDs' or 'xml_records' for which the function must
return a string.
Printed after each formatted record
req - an optional request object on which formatted records
can be printed (for "live" output )
prologue - a string printed before all formatted records string
epilogue - a string printed after all formatted records string
on_the_fly - if False, try to return an already preformatted version
of the records in the database
"""
def get_output_format_content_type(of):
"""
Returns the content type (eg. 'text/html' or 'application/ms-excel') \
of the given output format.
The function takes this mandatory parameter:
of - the code of output format for which we want to get the content type
"""
def record_get_xml(recID, format='xm', decompress=zlib.decompress):
"""
Returns an XML string of the record given by recID.
The function builds the XML directly from the database,
without using the standard formatting process.
'format' allows to define the flavour of XML:
- 'xm' for standard XML
- 'marcxml' for MARC XML
- 'oai_dc' for OAI Dublin Core
- 'xd' for XML Dublin Core
If record does not exist, returns empty string.
The function takes the following parameters:
recID - the id of the record to retrieve
format - the XML flavor in which we want to get the record
decompress - a function used to decompress the record from the database
"""
The API of the BibFormat Object ('bfo') given as a parameter to
the format function of format elements consists of the following
functions. This API is to be used only inside format elements.
def control_field(self, tag, escape='0'):
"""
Returns the value of control field given by tag in record.
If the value does not exist, returns empty string
The returned value is always a string.
'escape' parameter allows to escape special characters
of the field. The value of escape can be:
0 - no escaping
1 - escape all HTML characters
2 - escape all HTML characters by default. If field starts with <!--HTML-->,
escape only unsafe characters, but leave basic HTML tags.
This is particularly useful if you want to store HTML text in your
metadata but still want to escape some tags to prevent
XSS vulnerabilities. Note that this method is slower than
basic escaping of mode 1.
The arguments are:
tag - the marc code of a field
escape - 1 if returned value should be escaped. Else 0.
(see above for other modes)
"""
def field(self, tag, escape='0'):
"""
Returns the value of the field corresponding to tag in the
current record.
If the value does not exist, returns empty string
The returned value is always a string.
'escape' parameter allows to escape special characters
of the field. The value of escape can be:
0 - no escaping
1 - escape all HTML characters
2 - escape all HTML characters by default. If field starts with <!--HTML-->,
escape only unsafe characters, but leave basic HTML tags.
This is particularly useful if you want to store HTML text in your
metadata but still want to escape some tags to prevent
XSS vulnerabilities. Note that this method is slower than
basic escaping of mode 1.
The arguments are:
tag - the marc code of a field
escape - 1 if returned value should be escaped. Else 0.
(see above for other modes)
"""
def fields(self, tag, escape='0', repeatable_subfields_p=False):
"""
Returns the list of values corresponding to "tag".
If tag has an undefined subcode (such as 999C5),
the function returns a list of dictionaries, whose keys
are the subcodes and the values are the values of tag.subcode.
If the tag has a subcode, simply returns list of values
corresponding to tag.
Eg. for given MARC:
999C5 $a value_1a $b value_1b
999C5 $b value_2b
999C5 $b value_3b $b value_3b_bis
>> bfo.fields('999C5b')
>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
>> bfo.fields('999C5')
>> [{'a':'value_1a', 'b':'value_1b'},
{'b':'value_2b'},
{'b':'value_3b'}]
By default the function returns only one value for each
subfield (that is it considers that repeatable subfields are
not allowed). This is why in the above example 'value_3b_bis' is
not shown for bfo.fields('999C5'). (Note that it is not
defined which of value_3b or value_3b_bis is returned). This
is to simplify the use of the function, as most of the time
subfields are not repeatable (in that way we get a string
instead of a list). You can allow repeatable subfields by
setting 'repeatable_subfields_p' parameter to True. In
this mode, the above example would return:
>> bfo.fields('999C5b', repeatable_subfields_p=True)
>> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
>> bfo.fields('999C5', repeatable_subfields_p=True)
>> [{'a':['value_1a'], 'b':['value_1b']},
{'b':['value_2b']},
{'b':['value_3b', 'value_3b_bis']}]
NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT. Also note
that whatever the value of 'repeatable_subfields_p' is,
bfo.fields('999C5b') always shows all fields, even repeatable
ones. This is because the parameter has no impact on the
returned structure (it is always a list).
'escape' parameter allows to escape special characters
of the field. The value of escape can be:
0 - no escaping
1 - escape all HTML characters
2 - escape all HTML characters by default. If field starts with <!--HTML-->,
escape only unsafe characters, but leave basic HTML tags.
This is particularly useful if you want to store HTML text in your
metadata but still want to escape some tags to prevent
XSS vulnerabilities. Note that this method is slower than
basic escaping of mode 1.
The arguments are:
tag - the marc code of a field
escape - 1 if returned value should be escaped. Else 0.
(see above for other modes)
"""
def kb(self, kb, string, default=""):
"""
Returns the value of the "string" in the knowledge base "kb".
If kb does not exist or string does not exist in kb,
returns 'default' string or empty string if not specified
The arguments are as follows:
kb - the knowledge base name in which we want to find the mapping.
If it does not exist the function returns the original
'string' parameter value. The name is case insensitive (Uses
the SQL 'LIKE' syntax to retrieve value).
string - the value for which we want to find a translation.
If it does not exist the function returns 'default' string.
The string is case insensitive (Uses the SQL 'LIKE' syntax
to retrieve value).
default - a default value returned if 'string' not found in 'kb'.
"""
def get_record(self):
"""
Returns the record encapsulated in bfo as a BibRecord structure.
You can get full access to the record through bibrecord.py functions.
"""
Example (from inside BibFormat element):
>> bfo.field("520.a")
>> 'We present a quantitative appraisal of the physics potential
for neutrino experiments.'
>>
>> bfo.control_field("001")
>> '12'
>>
>> bfo.fields("700.a")
>>['Alekhin, S I', 'Anselmino, M', 'Ball, R D', 'Boglione, M']
>>
>> bfo.kb("DBCOLLID2COLL", "ARTICLE")
>> 'Published Article'
>>
>> bfo.kb("DBCOLLID2COLL", "not in kb", "My Value")
>> 'My Value'
Moreover you can have access to the language requested for the
formatting, the search pattern used by the user in the web
interface and the userID by directly getting the attribute from 'bfo':
bfo.lang
"""
Returns the language that was asked to be used for the
formatting. Always returns a string.
"""
bfo.search_pattern
"""
Returns the search pattern specified by the user when
the record had to be formatted. Always returns a string.
"""
+ bfo.user_info
+ """
+ Returns a dictionary with information about current user.
+ The returned dictionary has the following structure:
+ user_info = {
+ 'remote_ip' : '',
+ 'remote_host' : '',
+ 'referer' : '',
+ 'uri' : '',
+ 'agent' : '',
+ 'apache_user' : '',
+ 'apache_group' : [],
+ 'uid' : -1,
+ 'nickname' : '',
+ 'email' : '',
+ 'group' : [],
+ 'guest' : '1'
+ }
+ """
+
bfo.uid
"""
- Returns the user ID of the user who shall view the formatted
- record.
+ ! DEPRECATED: use bfo.user_info['uid'] instead
"""
bfo.recID
"""
Returns the id of the record
"""
bfo.req
"""
- Returns the mod_python request object
+ ! DEPRECATED: use bfo.user_info instead
"""
bfo.format
"""
Returns the format in which the record is being formatted
"""
Example (from inside BibFormat element):
>> bfo.lang
>> 'en'
>>
>> bfo.search_pattern
>> 'mangano and neutrino and factory'
2. The philosophy behind BibFormat
BibFormat is in charge of formatting the bibliographic records that
are displayed to your users. As you potentially have a huge amount of
bibliographic records, you cannot specify manually for each of them
how it should be formatted. This is why you can define rules that will
allow BibFormat to understand which kind of formatting to apply to a given
record. You define this set of rules in what is called an "output
format".
You can have different output formats, each with its own characteristics.
For example you certainly want that when multiple bibliographic records are
displayed at the same time (as it happens in search results), only
short versions are shown to the user, while a detailed record is
preferable when a single record is displayed. You might also want to
let your users decide which kind of output they want. For example you
might need to display HTML for regular web browsing, but would also
give a BibTeX version of the bibliographic reference for direct
inclusion in a LaTeX document.
See section 5.1 to learn how to create or modify output formats.
While output formats define what kind of formatting must be applied,
they do not define HOW the formatting is done. This is the role of the
"format templates", which define the layout and look of a
bibliographic reference. These format templates are rather easy to
write if you know a little bit of HTML (see section 5.2 "Format
templates specifications"). You will certainly have to create
different format templates, for different kinds of records. For
example you might want records that contain pictures to display them,
maybe with captions, while records that do not have pictures limited
to printing a title and an abstract.
In summary, you have different output formats (like 'brief HTML',
'detailed HTML' or 'BibTeX') that call different format templates
according to some criteria.
There is still one kind of configuration file that we have not talked
about: the "format elements". These are the "bricks" that you use in
format templates, to get the values of a record. You will learn to use
them in your format template in section 5.2 "Format templates
specifications", but you will mostly not need to modify them or create
new ones. However if you do need to edit one, read section 5.3 "Format
elements specifications" (And if you know Python it will be easy, as
they are written in Python).
Finally BibFormat can make use of mapping tables called "knowledge
bases". Their primary use is to act like a translation table, to
normalize records before displaying them. For example, you can say
that records that have value "Phys Rev D" or "Physical Review D" for
field "published in" must display "Phys Rev : D." to users. See
section 5.4 to learn how to edit knowledge bases.
In summary, there are three layers. Output formats:
+-----------------------------------------------------+
| Output Format | (Layer 1)
| eg: HTML_Brief.bfo |
+-----------------------------------------------------+
call one of several `format templates':
+-------------------------+ +-------------------------+
| Format Template | | Format Template | (Layer 2)
| eg: preprint.bft | | eg: default.bft |
+-------------------------+ +-------------------------+
that use one or several format elements:
+--------------+ +----------------+ +-----------------+
|Format Element| |Format Element | | Format Element | (Layer 3)
|eg: authors.py| |eg: abstract.py | | eg: title.py |
+--------------+ +----------------+ +-----------------+
3. Differences between the old PHP version and the new Pythonic version
The most noticeable differences are:
a) "Behaviours" have been renamed "Output formats".
b) "Formats" have been renamed "Format templates". They are now
written in HTML.
c) "User defined functions" have been dropped.
d) "Extraction rules" have been dropped.
e) "Link rules" have been dropped.
f) "File formats" have been dropped.
g) "Format elements" have been introduced. They are written in Python,
and can simulate c), d) and e).
h) Formats can be managed through web interface or through
human-readable config files.
i) Introduction of tools like validator and dependencies checker.
j) Better support for multi-language formatting.
Some of the advantages are:
+ Management of formats is much clearer and easier (less concepts,
more tools).
+ Writing formats is easier to learn : less concepts
to learn, redesigned work-flow, use of existing well known and
well documented languages.
+ Editing formats is easier: You can use your preferred HTML editor such as
Emacs, Dreamweaver or Frontpage to modify templates, or any text
editor for output formats and format elements. You can also use the
simplified web administration interface.
+ Faster and more powerful templating system.
+ Separation of business logic (output formats, format elements)
and presentation layer (format templates). This makes the management
of formats simpler.
The disadvantages are:
- No backward compatibility with old formats.
- Stricter separation of business logic and presentation layer:
no more use of statements such as if(), forall() inside templates,
and this requires more work to put logic inside format elements.
4. Migrating from the previous PHP BibFormat version to the new Pythonic version
Old BibFormat formats are no longer compatible with the new BibFormat
files. If you have not modified the "Formats" or modified only a
little bit the "Behaviours", then the transition will be painless and
automatic. Otherwise you will have to manually rewrite some of the
formats. This should however not be a big problem. Firstly because the
CDS Invenio installation will provide both versions of BibFormat for
some time. Secondly because both BibFormat versions can run side by
side, so that you can migrate your formats while your server still
works with the old formats. Thirdly because we provide a migration
kit that can help you go through this process. Finally because the
migration is not so difficult, and because it will be much easier for
you to customize how BibFormat formats your bibliographic data.
Concerning the migration kit it can:
a) Effortlessly migrate your behaviours, unless they include complex
logic, which usually they don't.
b) Help you migrate formats to format templates and format elements.
c) Effortlessly migrate your knowledge bases.
Point b) is the most difficult to achieve: previous formats did mix
business logic and code for the presentation, and could use PHP
functions. The new BibFormat separates business logic and
presentation, and does not support PHP. The transition kit will try to
move business logic to the format elements, and the presentation to
the format templates. These files will be created for you, and will include
the original code and, if possible, a proposal of Python
translation. We recommend that you do not use the transition kit to
translate formats, especially if you have not modified default
formats, or only modified default formats in some limited places. You
will get cleaner code if you write format elements and format
templates yourself.
5. Specifications of the new BibFormat configuration files.
BibFormat uses human readable configuration files. However (apart
from format elements) these files can be edited and managed through
a web interface.
5.1 Output formats specifications
Output formats specify rules that define which format template
to use to format a record.
While the syntax of output formats is basic, we recommend that you use
the web interface to edit them, to be sure that you make no error.
The syntax of output format is the following one. First you
define which field code you put as the condition for the rule.
You suffix it with a colon. Then on next lines, define the values of
the condition, followed by --- and then the filename of the template
to use:
tag 980.a:
PICTURE --- PICTURE_HTML_BRIEF.bft
PREPRINT --- PREPRINT_HTML_BRIEF.bft
PUBLICATION --- PUBLICATION_HTML_BRIEF.bft
This means that if value of field 980.a is equal to PICTURE, then we
will use format template PICTURE_HTML_BRIEF.bft. Note that you must
use the filename of the template, not the name. Also note that spaces
at the end or beginning are not considered. On the following lines,
you can either put other conditions on tag 980.a, or add another tag on
which you want to put conditions.
At the end you can add a default condition:
default: PREPRINT_HTML_BRIEF.bft
which means that if no condition is matched, a format suitable for
Preprints will be used to format the current record.
The output format file could then look like this:
tag 980.a:
PICTURE --- PICTURE_HTML_BRIEF.bft
PREPRINT --- PREPRINT_HTML_BRIEF.bft
PUBLICATION --- PUBLICATION_HTML_BRIEF.bft
tag 8560.f:
.*@cern.ch --- SPECIAL_MEMBER_FORMATTING.bft
default: PREPRINT_HTML_BRIEF.bft
You can add as many rules as you want. Keep in mind that they are read
in the order they are defined, and that only the first rule that
matches will be used.
Notice the condition on tag 8560.f: it uses a regular expression to
match any email address that ends with @cern.ch (the regular
expression must be understandable by Python)
Some other considerations on the management of output formats:
- Output formats must be placed inside directory
/etc/bibformat/outputs/ of your CDS Invenio installation.
- Note that as long as you have not provided a name to an output
THROUGH the web interface, it will not be available as a choice
for your users in some parts of CDS Invenio.
- You should remove output formats THROUGH the web interface.
- The format extension of output format is .bfo
5.2 Format templates specifications
Format templates are written in HTML-like syntax. You can use the
standard HTML and CSS markup languages to do the formatting. The best
thing to do is to create a page in your favourite editor, and once you
are glad with it, add the dynamic part of the page, that is print the
fields of the records. Let's say you have defined this page:
<h1>Some title</h1>
<p><i>Abstract: </i>Some abstract</p>
Then you want that instead of "Some title" and "Some abstract", the
value of the current record that is being displayed is used. To do so,
you must use a format element brick. Either you know the name of the
brick by heart, or you look for it in the elements documentation (see
section 5.3). For example you would find there that you can print the
title of the record by writing the HTML tag <BFE_TITLE /> in your
format template, with parameter 'default' for a default value.
<h1><BFE_TITLE default="No Title"/></h1>
<p><BFE_ABSTRACT limit="1" prefix="<i>Abstract: </i>"
default="No abstract"/></p>
Notice that <BFE_ABSTRACT /> has a parameter "limit" that <BFE_TITLE/>
does not have ("limit" allows to limit the number of sentences of the
abstract, according to the documentation). Note that while format
elements might have different parameters, they can always take the
three following ones: "prefix" and "suffix", whose values are printed
only if the element is not empty, and "default", which is printed only
if element is an empty string. We have used "prefix" for the abstract,
so that the label "<i>Abstract: </i>" is only printed if the record
has an abstract.
You should also provide these tags in all of your templates:
-<name>a name for this template in the admin web interface</name>
-<description>a description to be used in admin web interface for
this template</description>
Another feature of the templates is the support for multi-languages
outputs. You can include <lang> tags, which contain tags labeled with
the names of the languages supported in CDS Invenio. For example, one
might write:
<lang><en>A record:</en><fr>Une notice:</fr></lang>
<h1><BFE_TITLE default="No Title"/></h1>
<p><BFE_ABSTRACT limit="1" prefix="<i>Abstract: </i>"
default="No abstract"/></p>
When doing this you should at least make sure that the default
language of your server installation is available in each <lang>
tag. It is the one that is used if the requested language to display
the record is not available. Note that we could also provide a
translation in a similar way for the "No Title" default value inside
<BFE_Title /> tag.
Some other considerations on the use of elements inside templates:
-Format elements names are not case sensitive
-Format elements names always start with <BFE_
-Format elements parameters can contain '<' characters,
and quotes different from the kind that delimit parameters (you can
for example have <BFE_Title default='<a href="#">No Title</a>'/> )
-Format templates must be placed inside the directory
/etc/bibformat/templates/ of your CDS Invenio installation
-The format extension of a template is .bft
Trick: you can use the <BFE_FIELD tag="245__a" /> to print the value
of any field 245 $a in your templates. This practice is however not
recommended because it would necessitate to revise all format
templates if you change meaning of the MARC code schema.
5.3 Format elements specifications
Format elements are the bricks used in format templates to provide the
dynamic contents inside format templates.
For the most basic format elements, you do not even need to write
them: as long as you define `tag names' for MARC tags in the BibIndex
Admin's Manage logical fields interface (database table tag),
BibFormat knows which field must be printed when <BFE_tag_name/> is
used inside a template.
However for more complex processing, you will need to write a format
element. A format element is written in Python. Therefore its file
extension is ".py". The name you choose for the file is the one that
will be used inside format template to call the element, so choose it
carefully such that it is not too long, but self explanatory (you can
prefix the filename with BFE or not, but the element will always be called
with prefix <BFE_ inside templates). Then you just need to drop the
file in the /lib/python/invenio/bibformat_elements/ directory
of your CDS Invenio installation. Inside your file you have to define a
function named "format", which takes at least a "bfo" parameter (bfo
for BibFormat Object). The function must return a string:
def format(bfo):
out = ""
return out
You can have as many parameters as you want, as long as you make sure
that parameter bfo is here. Let's see how to define an element that
will print a brief title. It will take a parameter 'limit' that will
limit the number of characters printed. We can provide some
documentation for the element in the docstring of the
function.
def format(bfo, limit="10"):
"""
Prints a short title
@param limit a limit for the number of printed characters
"""
out = ""
return out
Note that we put a default value of 10 in the 'limit' parameter. To
get some value of a field, we must request the 'bfo' object. For
example we can get the value of field 245.a (field "title"):
def format(bfo, limit="10"):
"""
Prints a short title
@param limit a limit for the number of printed characters
"""
title = bfo.field('245.a')
limit = int(limit)
if limit > len(title):
limit = len(title)
return title[:limit]
As format elements are written in Python, we have decided not to give
permission to edit elements through the web interface. Firstly for
security reasons. Secondly because Python requires correct indentation,
which is difficult to achieve through a web interface.
You can have access to the documentation of your element through a web
interface. This is very useful when you are writing a format template,
to see which elements are available, what they do, which parameters they
take, what are the default values of parameters, etc. The
documentation is automatically extracted from format elements.
Here follows a sample documentation generated for the element
<BFE_TITLE />:
+--------------------------------------------------------------------------------------------+
| TITLE |
| ----- |
| <BFE_TITLE separator="..." prefix="..." suffix="..." default="..." /> |
| |
| Prints the title of a record. |
| |
| Parameters: |
| separator - separator between the different titles. |
| prefix - A prefix printed only if the record has a value for this element. |
| suffix - A suffix printed only if the record has a value for this element. |
| default - A default value printed if the record has no value for this element. |
| |
| See also: |
| Format templates that use this element |
| The Python code of this element |
+--------------------------------------------------------------------------------------------+
The more you provide documentation in the docstring of your elements,
the easier it will be to write format template afterwards.
Some remarks concerning format elements:
-parameters are always string values
-if no value is given as parameter in the format template, then the
value of parameter is "" (empty string)
-the docstring should contain a description, followed by
"@param parameter some description for parameter" for each
parameter (to give description for each parameter
in element documentation), and @see an_element.py, another_element.py
(to link to other elements in the documentation). Similar to JavaDoc.
-the following names cannot be used as parameters:
"default", "prefix", "suffix" and "escape". They can however always be
used in the format template for any element.
Another important remark concerns the 'escaping' of output of format
elements. In most cases, format elements output is to be used for
HTML/XML. Therefore special characters such as < or & have to be
'escaped', replaced by '&lt;' and '&amp;'. This is why all outputs
produced by format elements are automatically escaped by BibFormat,
unless specified otherwise. This means that you do not have to care
about meta-data that would break your HTML displaying or XML export
such as a physics formula like 'a < b'. Please also note that value
given in 'prefix', 'suffix' and 'default' parameters are not escaped,
such that you can safely use HTML tags for these.
There are always cases where the default 'escaping' behaviour of
BibFormat is not desired. For example when you explicitly output HTML
text, like links: you do not want to see them escaped. The first way
to avoid this is to modify the call to your format element in the
format template, by setting the default 'escape' parameter to 0:
<BFE_TITLE escape="0"/>
This is however inconvenient as you may need to modify a
lot of templates. The other way is to add another function to
your format element, named 'escape_values':
def escape_values(bfo):
"""
Called by BibFormat in order to check if output of this element
should be escaped.
"""
return 0
In that way all calls to your format element will produce unescaped
output. You will have to take care of escaping values "manually" in
your format element code, in order to avoid non valid outputs or XSS
vulnerabilities. There are methods to ease the escaping in your code
described in section 1.
Please also note that if you use this method, your element can still
be escaped if a call to your element from a format template
explicitly asks for the value to be escaped using parameter 'escape'.
5.4 Knowledge bases specifications
Knowledge bases cannot be managed through configuration files.
You can very easily add new bases and mappings using the given web GUI.
-- End of file --
diff --git a/modules/bibformat/lib/bibformat.py b/modules/bibformat/lib/bibformat.py
index bef441260..a62f7ac8a 100644
--- a/modules/bibformat/lib/bibformat.py
+++ b/modules/bibformat/lib/bibformat.py
@@ -1,436 +1,449 @@
# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Format records using specified format.
-API functions: format_record, format_records, create_excel, get_output_format_content_type
+API functions: format_record, format_records, create_excel,
+ get_output_format_content_type
-Used to wrap the BibFormat engine and associated functions. This is also where
-special formatting of multiple records (that the engine does not handle, as it works
-on a single record basis) should be put, with name create_*.
+Used to wrap the BibFormat engine and associated functions. This is
+also where special formatting of multiple records (that the engine
+does not handle, as it works on a single record basis) should be put,
+with name create_*.
SEE: bibformat_utils.py
FIXME: currently copies record_exists() code from search engine. Refactor later.
"""
__revision__ = "$Id$"
import zlib
from invenio import bibformat_dblayer
from invenio import bibformat_engine
from invenio import bibformat_utils
from invenio.config import cdslang, weburl, CFG_PATH_PHP
from invenio.bibformat_config import CFG_BIBFORMAT_USE_OLD_BIBFORMAT
try:
import invenio.template
websearch_templates = invenio.template.load('websearch')
except:
pass
import getopt
import sys
# Functions to format a single record
##
-def format_record(recID, of, ln=cdslang, verbose=0, search_pattern=[], xml_record=None, uid=None, on_the_fly=False):
+def format_record(recID, of, ln=cdslang, verbose=0, search_pattern=[],
+ xml_record=None, user_info=None, on_the_fly=False):
"""
Formats a record given output format.
- Returns a formatted version of the record in
- the specified language, search pattern, and with the specified output format.
+ Returns a formatted version of the record in the specified
+ language, search pattern, and with the specified output format.
The function will define which format template must be applied.
- The record to be formatted can be specified with its ID (with 'recID' parameter) or given
- as XML representation(with 'xml_record' parameter). If both are specified 'recID' is ignored.
+ The record to be formatted can be specified with its ID (with
+ 'recID' parameter) or given as XML representation (with
+ 'xml_record' parameter). If both are specified 'recID' is ignored.
- 'uid' allows to grant access to some functionalities on a page depending
- on the user's priviledges. Typically use webuser.getUid(req). This uid has sense
- only in the case of on-the-fly formatting.
+ 'user_info' allows to grant access to some functionalities on a
+ page depending on the user's privileges. The 'user_info' object
+ makes sense only in the case of on-the-fly formatting. 'user_info'
+ is the same object as the one returned by
+ 'webuser.collect_user_info(req)'
@param recID the ID of record to format
@param of an output format code (or short identifier for the output format)
@param ln the language to use to format the record
@param verbose the level of verbosity from 0 to 9 (0: silent,
5: errors,
7: errors and warnings, stop if error in format elements
9: errors and warnings, stop if error (debug mode ))
@param search_pattern list of strings representing the user request in web interface
@param xml_record an xml string representation of the record to format
- @param uid the user id of the person who will view the formatted page (if applicable)
+ @param user_info the information of the user who will view the formatted page (if applicable)
@param on_the_fly if False, try to return an already preformatted version of the record in the database
@return formatted record
"""
# NOTE(review): 'search_pattern' defaults to a shared list object; it is
# only passed through here, but callees should not mutate it.
out = ""
if verbose == 9:
out += """\n
Formatting record %i with output format %s.
""" % (recID, of)
############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
if CFG_BIBFORMAT_USE_OLD_BIBFORMAT and CFG_PATH_PHP:
return bibformat_engine.call_old_bibformat(recID, format=of, on_the_fly=on_the_fly)
############################# END ##################################
if not on_the_fly and \
(ln==cdslang or CFG_BIBFORMAT_USE_OLD_BIBFORMAT):
# Try to fetch preformatted record
# Only possible for records formatted in cdslang language (other are never stored)
res = bibformat_dblayer.get_preformatted_record(recID, of)
if res is not None:
# record 'recID' is formatted in 'of', so return it
if verbose == 9:
last_updated = bibformat_dblayer.get_preformatted_record_date(recID, of)
out += """\n
Found preformatted output for record %i (cache updated on %s).
""" % (recID, last_updated)
out += res
return out
else:
if verbose == 9:
out+= """\n
No preformatted output found for record %s.
"""% recID
# Live formatting of records in all other cases
if verbose == 9:
out+= """\n
Formatting record %i on-the-fly.
""" % recID
try:
out += bibformat_engine.format_record(recID=recID,
of=of,
ln=ln,
verbose=verbose,
search_pattern=search_pattern,
xml_record=xml_record,
- uid=uid)
+ user_info=user_info)
return out
except Exception, e:
#Failsafe execution mode
# Any engine failure falls back to the websearch templates below.
# NOTE(review): 'websearch_templates' is bound by a module-level import
# wrapped in a bare 'except: pass'; if that import failed, the fallback
# itself raises NameError -- confirm this is acceptable.
if verbose == 9:
out+= """\n
An error occured while formatting record %i. (%s)
""" % (recID, str(e))
if of.lower() == 'hd':
if verbose == 9:
out+= """\n
Formatting record %i with websearch_templates.tmpl_print_record_detailed.
""" % recID
return out + websearch_templates.tmpl_print_record_detailed(
ln = ln,
recID = recID,
weburl = weburl,
)
if verbose == 9:
out+= """\n
Formatting record %i with websearch_templates.tmpl_print_record_brief.
""" % recID
return out + websearch_templates.tmpl_print_record_brief(ln = ln,
recID = recID,
weburl = weburl,
)
def record_get_xml(recID, format='xm', decompress=zlib.decompress):
    """
    Return the XML string of record 'recID'.

    Thin wrapper: every argument is handed over unchanged, by keyword,
    to bibformat_utils.record_get_xml. The XML is built directly from
    the database, without using the standard formatting process.

    Supported flavours for 'format':
      - 'xm' for standard XML
      - 'marcxml' for MARC XML
      - 'oai_dc' for OAI Dublin Core
      - 'xd' for XML Dublin Core

    If record does not exist, returns empty string.

    @param recID the id of the record to retrieve
    @param format the flavour of XML to produce
    @param decompress callable forwarded to bibformat_utils.record_get_xml
    @return the xml string of the record
    """
    forwarded = {'recID': recID,
                 'format': format,
                 'decompress': decompress}
    return bibformat_utils.record_get_xml(**forwarded)
# Helper functions to do complex formatting of multiple records
#
# You should not modify format_records when adding a complex
# formatting of multiple records, but add a create_* method
# that relies on format_records to do the formatting.
##
-def format_records(recIDs, of, ln=cdslang, verbose=0, search_pattern=None, xml_records=None, uid=None,
- record_prefix=None, record_separator=None, record_suffix=None,
- prologue="", epilogue="", req=None, on_the_fly=False):
+def format_records(recIDs, of, ln=cdslang, verbose=0, search_pattern=None,
+ xml_records=None, user_info=None, record_prefix=None,
+ record_separator=None, record_suffix=None, prologue="",
+ epilogue="", req=None, on_the_fly=False):
"""
- Returns a list of formatted records given by a list of record IDs or a list of records as xml.
- Adds a prefix before each record, a suffix after each record, plus a separator between records.
-
- Also add optional prologue and epilogue to the complete formatted list.
-
- You can either specify a list of record IDs to format, or a list of xml records,
- but not both (if both are specified recIDs is ignored).
-
- 'record_separator' is a function that returns a string as separator between records.
- The function must take an integer as unique parameter, which is the index
- in recIDs (or xml_records) of the record that has just been formatted. For example
- separator(i) must return the separator between recID[i] and recID[i+1].
- Alternatively separator can be a single string, which will be used to separate
- all formatted records.
- The same applies to 'record_prefix' and 'record_suffix'.
+ Returns a string of formatted records given by a list of record IDs
+ or a list of records as xml. Adds a prefix before each record, a
+ suffix after each record, plus a separator between records.
+
+ Also add optional prologue and epilogue to the complete formatted
+ list.
+
+ You can either specify a list of record IDs to format, or a list
+ of xml records, but not both (if both are specified recIDs is
+ ignored).
+
+ 'record_separator' is a function that returns a string as
+ separator between records. The function must take an integer as
+ unique parameter, which is the index in recIDs (or xml_records) of
+ the record that has just been formatted. For example separator(i)
+ must return the separator between recID[i] and recID[i+1].
+ Alternatively separator can be a single string, which will be used
+ to separate all formatted records. The same applies to
+ 'record_prefix' and 'record_suffix'.
'req' is an optional parameter on which the result of the function
are printed lively (prints records after records) if it is given.
- Note that you should set 'req' content-type by yourself, and send http header before calling
- this function as it will not do it.
+ Note that you should set 'req' content-type by yourself, and send
+ http header before calling this function as it will not do it.
This function takes the same parameters as 'format_record' except for:
@param recIDs a list of record IDs
@param xml_records a list of xml string representations of the records to format
@param record_prefix either a string or a function of the record index, printed before each record
@param record_separator either a string or a function of the record index, printed between records
@param record_suffix either a string or a function of the record index, printed after each record
@param prologue a string printed before all formatted records
@param epilogue a string printed after all formatted records
@param req an optional request object where to print records
@param on_the_fly if False, try to return an already preformatted version of the record in the database
@return the string prologue + formatted records + epilogue
"""
if req is not None:
req.write(prologue)
formatted_records = ''
#Fill one of the lists with Nones
# Both lists end up with the same length so they can be indexed in lockstep.
if xml_records is not None:
recIDs = map(lambda x:None, xml_records)
else:
xml_records = map(lambda x:None, recIDs)
total_rec = len(recIDs)
last_iteration = False
for i in range(total_rec):
if i == total_rec - 1:
last_iteration = True
#Print prefix
# A 'str' prefix is used verbatim; anything else is called with the index.
if record_prefix is not None:
if isinstance(record_prefix, str):
formatted_records += record_prefix
if req is not None:
req.write(record_prefix)
else:
string_prefix = record_prefix(i)
formatted_records += string_prefix
if req is not None:
req.write(string_prefix)
#Print formatted record
- formatted_record = format_record(recIDs[i], of, ln, verbose, search_pattern, xml_records[i], uid, on_the_fly)
+ formatted_record = format_record(recIDs[i], of, ln, verbose, \
+ search_pattern, xml_records[i],\
+ user_info, on_the_fly)
formatted_records += formatted_record
if req is not None:
req.write(formatted_record)
#Print suffix
if record_suffix is not None:
if isinstance(record_suffix, str):
formatted_records += record_suffix
if req is not None:
req.write(record_suffix)
else:
string_suffix = record_suffix(i)
formatted_records += string_suffix
if req is not None:
req.write(string_suffix)
#Print separator if needed
# No separator is emitted after the last record.
if record_separator is not None and not last_iteration:
if isinstance(record_separator, str):
formatted_records += record_separator
if req is not None:
req.write(record_separator)
else:
string_separator = record_separator(i)
formatted_records += string_separator
if req is not None:
req.write(string_separator)
if req is not None:
req.write(epilogue)
return prologue + formatted_records + epilogue
def create_excel(recIDs, req=None, ln=cdslang):
"""
Returns an Excel readable format containing the given recIDs.
If 'req' is given, also prints the output in 'req' while individual
records are being formatted.
This method shows how to create a custom formatting of multiple
records.
The excel format is a basic HTML table that most spreadsheets
applications can parse.
@param recIDs a list of record IDs
@param req an optional request object; when given, content type and
Content-Disposition are set on it and the http header is sent
@param ln language parameter (see the review note on the format_records call)
@return a string in Excel format
"""
# Prepare the column headers to display in the Excel file
column_headers_list = ['Title',
'Authors',
'Addresses',
'Affiliation',
'Date',
'Publisher',
'Place',
'Abstract',
'Keywords',
'Notes']
# Prepare Content
# NOTE(review): the string literals in this function look truncated in this
# copy (presumably HTML markup was lost), and 'footer' used below is not
# assigned anywhere in the visible code -- restore from the original file.
column_headers = '
'
#Apply content_type and print column headers
if req is not None:
req.content_type = get_output_format_content_type('excel')
req.headers_out["Content-Disposition"] = "inline; filename=%s" % 'results.xls'
req.send_http_header()
#Format the records
# NOTE(review): 'cdslang' is passed as language, not the 'ln' argument of
# this function -- confirm whether 'ln=ln' was intended.
excel_formatted_records = format_records(recIDs, 'excel', ln=cdslang,
record_separator='\n',
prologue = '
',
epilogue = footer,
req=req)
return excel_formatted_records
# Utility functions
##
def get_output_format_content_type(of):
    """
    Look up the content type (eg. 'text/html' or 'application/ms-excel')
    registered for an output format.

    The value is read through bibformat_dblayer; when the stored value
    is the empty string, 'text/html' is returned instead.

    @param of the code of output format for which we want to get the content type
    @return the content type of the output format
    """
    stored_type = bibformat_dblayer.get_output_format_content_type(of)
    if stored_type != '':
        return stored_type
    # Nothing configured for this format: default to HTML
    return 'text/html'
def usage(exitcode=1, msg=""):
"""Prints usage info."""
if msg:
sys.stderr.write("Error: %s.\n" % msg)
print """BibFormat: outputs the result of the formatting of a record.
Usage: bibformat required [options]
Examples:
$ bibformat -i 10 -o HB
$ bibformat -i 10,11,13 -o HB
$ bibformat -i 10:13
$ bibformat -i 10 -o HB -v 9
Required:
-i, --id=ID[ID2,ID3:ID5] ID (or range of IDs) of the record(s) to be formatted.
Options:
-o, --output=CODE short code of the output format used for formatting (default HB).
-l, --lang=LN language used for formatting.
-y, --onthefly on-the-fly formatting, avoiding caches created by BibReformat.
General options:
-h, --help print this help and exit
-v, --verbose=LEVEL verbose level (from 0 to 9, default 0)
-V --version print the script version
"""
sys.exit(exitcode)
def main():
"""main entry point for biformat via command line"""
options = {} # will hold command-line options
options["verbose"] = 0
options["onthefly"] = False
options["lang"] = cdslang
options["output"] = "HB"
options["recID"] = None
try:
opts, args = getopt.getopt(sys.argv[1:],
"hVv:yl:i:o:",
["help",
"version",
"verbose=",
"onthefly",
"lang=",
"id=",
"output="])
except getopt.GetoptError, err:
usage(1, err)
pass
try:
for opt in opts:
if opt[0] in ["-h", "--help"]:
usage(0)
elif opt[0] in ["-V", "--version"]:
print __revision__
sys.exit(0)
elif opt[0] in ["-v", "--verbose"]:
options["verbose"] = int(opt[1])
elif opt[0] in ["-y", "--onthefly"]:
options["onthefly"] = True
elif opt[0] in ["-l", "--lang"]:
options["lang"] = opt[1]
elif opt[0] in ["-i", "--id"]:
recIDs = []
for recID in opt[1].split(','):
if ":" in recID:
start = int(recID.split(':')[0])
end = int(recID.split(':')[1])
recIDs.extend(range(start, end))
else:
recIDs.append(int(recID))
options["recID"] = recIDs
elif opt[0] in ["-o", "--output"]:
options["output"] = opt[1]
if options["recID"] == None:
usage(1, "-i argument is needed")
except StandardError, e:
usage(e)
print format_records(recIDs=options["recID"],
of=options["output"],
ln=options["lang"],
verbose=options["verbose"],
on_the_fly=options["onthefly"])
return
if __name__ == "__main__":
main()
diff --git a/modules/bibformat/lib/bibformat_engine.py b/modules/bibformat/lib/bibformat_engine.py
index 50a56dba0..ef5fbf90f 100644
--- a/modules/bibformat/lib/bibformat_engine.py
+++ b/modules/bibformat/lib/bibformat_engine.py
@@ -1,1973 +1,1991 @@
# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Formats a single XML Marc record using specified format.
There is no API for the engine. Instead use bibformat.py.
SEE: bibformat.py, bibformat_utils.py
"""
__revision__ = "$Id$"
import re
import sys
import os
import inspect
import traceback
import zlib
import cgi
from invenio.config import \
CFG_PATH_PHP, \
bindir, \
cdslang
from invenio.errorlib import \
register_errors, \
get_msgs_for_code_list
from invenio.bibrecord import \
create_record, \
record_get_field_instances, \
record_get_field_value, \
record_get_field_values
from invenio.bibformat_xslt_engine import format
from invenio.dbquery import run_sql
from invenio.messages import \
language_list_long, \
wash_language
from invenio import bibformat_dblayer
from invenio.bibformat_config import \
CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION, \
CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION, \
CFG_BIBFORMAT_TEMPLATES_PATH, \
CFG_BIBFORMAT_ELEMENTS_PATH, \
CFG_BIBFORMAT_OUTPUTS_PATH, \
CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH
from invenio.bibformat_utils import \
record_get_xml, \
parse_tag
from invenio.htmlutils import HTMLWasher
+from invenio.webuser import collect_user_info
if CFG_PATH_PHP: #Remove when call_old_bibformat is removed
from xml.dom import minidom
import tempfile
# Cache for data we have already read and parsed
format_templates_cache = {}
format_elements_cache = {}
format_outputs_cache = {}
kb_mappings_cache = {}
cdslangs = language_list_long()
# NOTE(review): 'html_field' is empty in this copy although the comment
# describes it as a marker string -- presumably the marker was lost; verify.
html_field = '' # String indicating that field should be
# treated as HTML (and therefore no escaping of
# HTML tags should occur.
# Appears in some field values.
washer = HTMLWasher() # Used to remove dangerous tags from HTML
# sources
# NOTE(review): the verbose regular expressions below contain '(?P' groups
# without a name and seem to miss their opening tag text; as written they
# would not compile. Restore them from the original file before use.
# Regular expression for finding ... tag in format templates
pattern_lang = re.compile(r'''
#closing start tag
(?P.*?) #anything but the next group (greedy)
() #end tag
''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Builds regular expression for finding each known language in tags
# The alternation is built from the first item of every entry of 'cdslangs'.
ln_pattern_text = r"<("
for lang in cdslangs:
ln_pattern_text += lang[0] +r"|"
ln_pattern_text = ln_pattern_text.rstrip(r"|")
ln_pattern_text += r")>(.*?)\1>"
ln_pattern = re.compile(ln_pattern_text, re.IGNORECASE | re.DOTALL)
# Regular expression for finding tag in format templates
pattern_format_template_name = re.compile(r'''
#closing start tag
(?P.*?) #name value. any char that is not end tag
()(\n)? #end tag
''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Regular expression for finding tag in format templates
pattern_format_template_desc = re.compile(r'''
#closing start tag
(?P.*?) #description value. any char that is not end tag
(\n)? #end tag
''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Regular expression for finding tags in format templates
# Groups 'function_name' and 'params' are read from matches of this pattern
# in eval_format_template_elements().
pattern_tag = re.compile(r'''
[^/\s]+) #any char but a space or slash
\s* #any number of spaces
(?P(\s* #params here
(?P([^=\s])*)\s* #param name: any chars that is not a white space or equality. Followed by space(s)
=\s* #equality: = followed by any number of spaces
(?P[\'"]) #one of the separators
(?P.*?) #param value: any chars that is not a separator like previous one
(?P=sep) #same separator as starting one
)*) #many params
\s* #any number of spaces
(/)?> #end of the tag
''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
# Regular expression for finding params inside tags in format templates.
# Each match is one name=value pair: group 'param' is the parameter name,
# group 'value' its value, and group 'sep' the quote character (single or
# double) that both opens and closes the value.
pattern_function_params = re.compile(r'''
    (?P<param>([^=\s])*)\s*  # Param name: any chars that is not a white space or equality. Followed by space(s)
    =\s*                     # Equality: = followed by any number of spaces
    (?P<sep>[\'"])           # One of the separators
    (?P<value>.*?)           # Param value: any chars that is not a separator like previous one
    (?P=sep)                 # Same separator as starting one
    ''', re.VERBOSE | re.DOTALL)
# Regular expression for finding format elements "params" attributes
# (defined by @param)
# NOTE(review): the '(?P' groups below have no name in this copy, so the
# patterns would not compile as written -- restore from the original file.
pattern_format_element_params = re.compile('''
@param\s* # Begins with @param keyword followed by space(s)
(?P[^\s=]*)\s* # A single keyword, and then space(s)
#(=\s*(?P[\'"]) # Equality, space(s) and then one of the separators
#(?P.*?) # Default value: any chars that is not a separator like previous one
#(?P=sep) # Same separator as starting one
#)?\s* # Default value for param is optional. Followed by space(s)
(?P.*) # Any text that is not end of line (thanks to MULTILINE parameter)
''', re.VERBOSE | re.MULTILINE)
# Regular expression for finding format elements "see also" attribute
# (defined by @see)
pattern_format_element_seealso = re.compile('''@see\s*(?P.*)''',
re.VERBOSE | re.MULTILINE)
#Regular expression for finding 2 expressions in quotes, separated by
#comma (as in template("1st","2nd") )
#Used when parsing output formats
# (The pattern itself is commented out below and therefore not defined.)
## pattern_parse_tuple_in_quotes = re.compile('''
## (?P[\'"])
## (?P.*)
## (?P=sep1)
## \s*,\s*
## (?P[\'"])
## (?P.*)
## (?P=sep2)
## ''', re.VERBOSE | re.MULTILINE)
def call_old_bibformat(recID, format="HD", on_the_fly=False, verbose=0):
    """
    FIXME: REMOVE FUNCTION WHEN MIGRATION IS DONE

    Calls BibFormat for the record RECID in the desired output format FORMAT.

    Unless 'on_the_fly' is set, a preformatted version stored in the
    'bibfmt' table is returned when one exists. Otherwise the MARCXML of
    the record is piped to the external '<bindir>/bibformat' program and
    its output is returned.

    Note: this functions always try to return HTML, so when
    bibformat returns XML with embedded HTML format inside the tag
    FMT $g, as is suitable for prestoring output formats, we
    perform un-XML-izing here in order to return HTML body only.

    @param recID the ID of the record to format
    @param format the code of the output format
    @param on_the_fly if False, try to return an already preformatted version of the record in the database
    @param verbose when 9, debug messages are prepended to the output
    @return the formatted record
    """
    out = ""
    res = []
    if not on_the_fly:
        # look for formatted notice existence. The values are handed to
        # run_sql() as parameters instead of being pasted into the SQL
        # text, so that quoting is done by the database layer.
        query = "SELECT value, last_updated FROM bibfmt WHERE "\
                "id_bibrec=%s AND format=%s"
        res = run_sql(query, (recID, format), 1)

    if res:
        # record 'recID' is formatted in 'format', so print it
        if verbose == 9:
            last_updated = res[0][1]
            out += "\n\nFound preformatted output for record %i (cache updated on %s).\n" % (recID, last_updated)
        decompress = zlib.decompress
        # Keep the verbose prefix: it used to be built and then dropped
        return out + "%s" % decompress(res[0][0])

    # record 'recID' is not formatted in 'format',
    # so try to call BibFormat on the fly or use default format:
    if verbose == 9:
        out += "\n\nFormatting record %i on-the-fly with old BibFormat.\n" % recID

    # Retrieve MARCXML
    # Build it on-the-fly only if 'call_old_bibformat' was called
    # with format=xm and on_the_fly=True
    xm_record = record_get_xml(recID, 'xm',
                               on_the_fly=(on_the_fly and format == 'xm'))

    # (A commented-out os.system/tempfile variant of the call below, meant
    # as a workaround for 'Red Hat' builds of Python, used to live here.)
    pipe_input, pipe_output, pipe_error = os.popen3(["%s/bibformat" % bindir,
                                                     "otype=%s" % format],
                                                    'rw')
    pipe_input.write(xm_record)
    pipe_input.flush()
    pipe_input.close()
    bibformat_output = pipe_output.read()
    pipe_output.close()
    pipe_error.close()

    # NOTE(review): the prefix tested here was an empty string in the copy
    # under review, which made the test always true and the 'else' branch
    # unreachable. '<record>' is assumed to be the original marker for XML
    # output -- confirm against the upstream file.
    if bibformat_output.startswith("<record>"):
        # XML wrapping: keep only the text of the subfields with code 'g'
        dom = minidom.parseString(bibformat_output)
        for e in dom.getElementsByTagName('subfield'):
            if e.getAttribute('code') == 'g':
                for t in e.childNodes:
                    out += t.data.encode('utf-8')
    else:
        out += bibformat_output

    return out
def format_record(recID, of, ln=cdslang, verbose=0,
- search_pattern=[], xml_record=None, uid=None):
+ search_pattern=[], xml_record=None, user_info=None):
"""
- Formats a record given output format. Main entry function of bibformat engine.
+ Formats a record given output format. Main entry function of
+ bibformat engine.
- Returns a formatted version of the record in
- the specified language, search pattern, and with the specified output format.
+ Returns a formatted version of the record in the specified
+ language, search pattern, and with the specified output format.
The function will define which format template must be applied.
- You can either specify an record ID to format, or give its xml representation.
- if 'xml_record' is not None, then use it instead of recID.
+ You can either specify a record ID to format, or give its xml
+ representation. If 'xml_record' is not None, then use it instead
+ of recID.
- 'uid' allows to grant access to some functionalities on a page depending
- on the user's priviledges.
+ 'user_info' allows to grant access to some functionalities on a
+ page depending on the user's privileges. 'user_info' is the same
+ object as the one returned by 'webuser.collect_user_info(req)'
@param recID the ID of record to format
@param of an output format code (or short identifier for the output format)
@param ln the language to use to format the record
@param verbose the level of verbosity from 0 to 9 (0: silent,
5: errors,
7: errors and warnings, stop if error in format elements
9: errors and warnings, stop if error (debug mode ))
@param search_pattern list of strings representing the user request in web interface
@param xml_record an xml string representing the record to format
- @param uid the user id of the person who will view the formatted page
+ @param user_info the information of the user who will view the formatted page
@return formatted record
"""
out = ""
# NOTE(review): 'errors_' is filled below but never returned or registered
# at the end of the function; only 'out' reaches the caller.
errors_ = []
# Temporary workflow (during migration of formats):
# Call new BibFormat
# But if format not found for new BibFormat, then call old BibFormat
#Create a BibFormat Object to pass that contain record and context
- bfo = BibFormatObject(recID, ln, search_pattern, xml_record, uid, of)
+ bfo = BibFormatObject(recID, ln, search_pattern, xml_record, user_info, of)
#Find out which format template to use based on record and output format.
template = decide_format_template(bfo, of)
if verbose == 9 and template is not None:
out += """\n
Using %s template for record %i.
""" % (template, recID)
############### FIXME: REMOVE WHEN MIGRATION IS DONE ###############
path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, template)
if template is None or not os.access(path, os.R_OK):
# template not found in new BibFormat. Call old one
if verbose == 9:
if template is None:
out += """\n
No template found for output format %s and record %i.
(Check invenio.err log file for more details)
""" % (of, recID)
else:
out += """\n
Template %s could not be read.
""" % (template)
if CFG_PATH_PHP:
if verbose == 9:
out += """\n
Using old BibFormat for record %s.
""" % recID
return out + call_old_bibformat(recID, format=of, on_the_fly=True, verbose=verbose)
############################# END ##################################
# No usable template and no old BibFormat available: report the error
error = get_msgs_for_code_list([("ERR_BIBFORMAT_NO_TEMPLATE_FOUND", of)],
stream='error', ln=cdslang)
errors_.append(error)
# The error is registered when verbose is 0 and shown in the output
# when verbose is above 5; for verbose 1 to 5 neither happens.
if verbose == 0:
register_errors(error, 'error')
elif verbose > 5:
return out + error[0][1]
return out
# Format with template
(out_, errors) = format_with_format_template(template, bfo, verbose)
errors_.extend(errors)
out += out_
return out
def decide_format_template(bfo, of):
    """
    Pick the format template name to apply for the given output format
    and BibFormatObject.

    The rules of the output format are tried in order and the template
    of the first matching rule wins. A rule matches when its value, used
    as a regular expression, covers the whole value of the rule's field
    in the record; lettercase and surrounding spaces are ignored on both
    sides. When no rule matches, the default template of the output
    format is used, or None if that default is empty.

    @param bfo a BibFormatObject
    @param of the code of the output format to use
    @return the name of the template, or None
    """
    output_format = get_output_format(of)

    for rule in output_format['rules']:
        record_value = bfo.field(rule['field']).strip()
        rule_pattern = rule['value'].strip()
        found = re.match(rule_pattern, record_value, re.IGNORECASE)
        if found is None:
            continue
        # The match has to span the complete value, not only a prefix
        if found.span() == (0, len(record_value)):
            return rule['template']

    default_template = output_format['default']
    if default_template == '':
        return None
    return default_template
def format_with_format_template(format_template_filename, bfo,
                                verbose=0, format_template_code=None):
    """
    Format the record held by 'bfo' with a format template and return
    the result together with the errors.

    The language is the one specified in bfo. When format_template_code
    is provided, it is used as template source instead of loading
    format_template_filename; the filename is still used to decide
    whether bft or xsl transformation applies. This allows previewing
    format code without having to save a file on disk.

    @param format_template_filename the filename of a format template
    @param bfo the object containing parameters for the current formatting
    @param format_template_code if not None, use code as template instead of reading format_template_filename (used for previews)
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return tuple (formatted text, errors)
    """
    if format_template_code is None:
        template_source = get_format_template(format_template_filename)['code']
    else:
        template_source = str(format_template_code)

    bft_suffix = "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION
    is_xsl = format_template_filename is not None and \
             not format_template_filename.endswith(bft_suffix)

    if is_xsl:
        # .xsl
        # NOTE(review): the original comment says "On-the-fly xm if we are
        # now formatting in xm" while the flag is set when the format is
        # NOT 'xm' -- confirm which one is intended.
        marcxml = record_get_xml(bfo.recID, 'xm',
                                 on_the_fly=(bfo.format != 'xm'))
        # Transform MARCXML using stylesheet; this branch reports no errors
        return (format(marcxml, template_source=template_source), [])

    # .bft (also used when no filename is given)
    localized_source = filter_languages(template_source, bfo.lang)
    (evaluated_source, element_errors) = eval_format_template_elements(
        localized_source, bfo, verbose)
    return (evaluated_source, element_errors)
def eval_format_template_elements(format_template, bfo, verbose=0):
    """
    Evaluates the format elements of the given template and replaces
    each element tag with its value. Also returns errors.

    Every match of 'pattern_tag' in the template is resolved to a format
    element with get_format_element(), evaluated with
    eval_format_element() and substituted by the result of the
    evaluation.

    @param format_template the format template code
    @param bfo the object containing parameters for the current formatting
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return tuple (result, errors)
    """
    errors_ = []

    # First define insert_element_code(match), used in re.sub() function
    def insert_element_code(match):
        """
        Analyses 'match', interprets the corresponding code, and returns
        the result of the evaluation.

        Called by substitution in 'eval_format_template_elements(...)'

        @param match a match object corresponding to the special tag that must be interpreted
        @return the text that replaces the matched tag
        """
        function_name = match.group("function_name")

        # Bind the name before the 'try': when loading raises and
        # verbose < 5 the handler does not return, and the test below
        # would otherwise fail with an unbound local variable.
        format_element = None
        try:
            format_element = get_format_element(function_name, verbose)
        except Exception:
            if verbose >= 5:
                e = sys.exc_info()[1]
                return '' + \
                       cgi.escape(str(e)).replace('\n', ' ') + \
                       ''

        if format_element is None:
            error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", function_name)],
                                           stream='error', ln=cdslang)
            errors_.append(error)
            if verbose >= 5:
                return '' + \
                       error[0][1]+''
            # Below verbosity 5 the unresolved tag is replaced by nothing
            return ''

        params = {}
        # Look for function parameters given in format template code
        all_params = match.group('params')
        if all_params is not None:
            for param_match in pattern_function_params.finditer(all_params):
                name = param_match.group('param')
                value = param_match.group('value')
                params[name] = value

        # Evaluate element with params and return the text only; the
        # errors are collected in the enclosing 'errors_' list
        (result, errors) = eval_format_element(format_element,
                                               bfo,
                                               params,
                                               verbose)
        errors_.append(errors)
        return result

    # Substitute special tags in the format by our own text.
    evaluated_template = pattern_tag.sub(insert_element_code, format_template)

    return (evaluated_template, errors_)
def eval_format_element(format_element, bfo, parameters={}, verbose=0):
    """
    Returns the result of the evaluation of the given format element
    name, with given BibFormatObject and parameters. Also returns
    the errors of the evaluation.

    Three cases are handled, depending on 'format_element':
      a) type "python": the element function is called with its declared
         parameters (plus 'bfo'); the output is then escaped, wrapped in
         prefix/suffix, or replaced by the default value when empty.
      b) type "field": the values of the element tags are read from the
         record, truncated to 'nbMax' values, joined with 'separator',
         optionally escaped and wrapped in prefix/suffix.
      c) anything else (including None): an error is reported.

    @param format_element a format element structure as returned by get_format_element
    @param bfo a BibFormatObject used for formatting
    @param parameters a dict of parameters to be used for formatting. Key is parameter and value is value of parameter. The dict is only read, never modified
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return tuple (result, errors)
    """
    errors = []
    # Load special values given as parameters
    prefix = parameters.get('prefix', "")
    suffix = parameters.get('suffix', "")
    default_value = parameters.get('default', "")
    escape = parameters.get('escape', "")

    output_text = ''

    if format_element is not None and format_element['type'] == "python":
        # a) We found an element with the tag name, of type "python"

        # Prepare a dict 'params' to pass as parameter to 'format'
        # function of element: declared parameters are filled with the
        # given values, or with their declared default values
        params = {}
        for param in format_element['attrs']['params']:
            name = param['name']
            default = param['default']
            params[name] = parameters.get(name, default)

        # Add BibFormatObject
        params['bfo'] = bfo

        # Execute function with given parameters and return result.
        function = format_element['code']
        try:
            output_text = function(**params)
        except Exception:
            e = sys.exc_info()[1]
            element_name = format_element['attrs']['name']
            error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT", element_name, str(params))
            errors.append(error)
            if verbose == 0:
                register_errors(errors, 'error')
            elif verbose >= 5:
                tb = sys.exc_info()[2]
                error_string = get_msgs_for_code_list(error,
                                                      stream='error',
                                                      ln=cdslang)
                stack = traceback.format_exception(Exception, e, tb, limit=None)
                output_text = ''+ \
                              str(error_string[0][1]) + "".join(stack) +' '

        # None can be returned when evaluating function
        if output_text is None:
            output_text = ""
        else:
            output_text = str(output_text)

        # Escaping:
        # (1) By default, everything is escaped in mode 1
        # (2) If evaluated element has 'escape_values()' function, use
        #     its returned value as escape mode, and override (1)
        # (3) If template has a defined parameter (in allowed values),
        #     use it, and override (1) and (2)

        # (1)
        escape_mode = 1

        # (2)
        escape_function = format_element['escape_function']
        if escape_function is not None:
            try:
                escape_mode = escape_function(bfo=bfo)
            except Exception:
                # Read the element name from the element itself: the
                # local 'name' only holds the last parameter name, and
                # is not even defined for elements without parameters.
                error = ("ERR_BIBFORMAT_EVALUATING_ELEMENT_ESCAPE",
                         format_element['attrs']['name'])
                errors.append(error)
                if verbose == 0:
                    register_errors(errors, 'error')
                elif verbose >= 5:
                    error_string = get_msgs_for_code_list(error,
                                                          stream='error',
                                                          ln=cdslang)
                    output_text += ''+ \
                                   str(error_string[0][1]) +' '

        # (3)
        if escape in ['0', '1', '2', '3', '4']:
            escape_mode = int(escape)

        # If escape is equal to 1, then escape all
        # HTML reserved chars.
        if escape_mode > 0:
            output_text = escape_field(output_text, mode=escape_mode)

        # Add prefix and suffix if they have been given as parameters and if
        # the evaluation of element is not empty
        if output_text.strip() != "":
            output_text = prefix + output_text + suffix

        # Add the default value if output_text is empty
        if output_text == "":
            output_text = default_value

        return (output_text, errors)

    elif format_element is not None and format_element['type'] == "field":
        # b) We have not found an element in files that has the tag
        # name. Then look for it in the table "tag"

        # Load special values given as parameters.
        # The key must be exactly 'separator': with a trailing space the
        # value given in the template is never found.
        separator = parameters.get('separator', "")
        nbMax = parameters.get('nbMax', "")
        escape = parameters.get('escape', "1") # By default, escape here

        # Get the fields tags that have to be printed
        tags = format_element['attrs']['tags']

        output_text = []

        # Get values corresponding to tags
        for tag in tags:
            p_tag = parse_tag(tag)
            values = record_get_field_values(bfo.get_record(),
                                             p_tag[0],
                                             p_tag[1],
                                             p_tag[2],
                                             p_tag[3])
            if len(values) > 0 and isinstance(values[0], dict):
                # Flatten each dict to its values only
                for subfields in values:
                    output_text.extend(subfields.values())
            else:
                output_text.extend(values)

        if nbMax != "":
            try:
                nbMax = int(nbMax)
                output_text = output_text[:nbMax]
            except (ValueError, TypeError):
                name = format_element['attrs']['name']
                error = ("ERR_BIBFORMAT_NBMAX_NOT_INT", name)
                errors.append(error)
                if verbose < 5:
                    register_errors(error, 'error')
                else:
                    error_string = get_msgs_for_code_list(error,
                                                          stream='error',
                                                          ln=cdslang)
                    # list.append() returns None: do not rebind
                    # 'output_text' to its result
                    output_text.append(error_string[0][1])

        # Add prefix and suffix if they have been given as parameters and if
        # the evaluation of element is not empty.
        # If evaluation is empty string, return default value if it exists.
        # Else return empty string
        if ("".join(output_text)).strip() != "":
            # If escape is equal to 1, then escape all
            # HTML reserved chars.
            if escape == '1':
                output_text = cgi.escape(separator.join(output_text))
            else:
                output_text = separator.join(output_text)

            output_text = prefix + output_text + suffix
        else:
            # Return default value
            output_text = default_value

        return (output_text, errors)
    else:
        # c) Element is unknown
        error = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_ELEMENT_NAME", format_element)],
                                       stream='error', ln=cdslang)
        errors.append(error)
        if verbose < 5:
            register_errors(error, 'error')
            return ("", errors)
        elif verbose >= 5:
            if verbose >= 9:
                # Debug mode: stop at first error
                sys.exit(error[0][1])
            return ('' + \
                    error[0][1]+'', errors)
def filter_languages(format_template, ln='en'):
    """
    Filters the language tags that do not correspond to the specified language.

    Each match of 'pattern_lang' in the template is replaced by the
    content of its inner localized tag for language 'ln'. When a match
    holds no tag for 'ln', the tag for 'cdslang' is kept instead.

    @param format_template the format template code
    @param ln the language that is NOT filtered out from the template
    @return the format template with unnecessary languages filtered out
    """
    def search_lang_tag(match):
        """
        Keeps, inside one language section, only the text of the
        localized tag to display and removes the other localized tags.

        @param match a match object corresponding to the special tag that must be interpreted
        @return the text that replaces the whole language section
        """
        lang_tag_content = match.group("langs")

        # Decide once which language is kept for this section: 'ln' if
        # the section has a tag for it, 'cdslang' otherwise
        kept_lang = ln
        pattern_kept_lang = re.compile(r"<("+kept_lang+ \
                                       r")\s*>(.*?)("+kept_lang+r"\s*>)", re.IGNORECASE | re.DOTALL)
        if re.search(pattern_kept_lang, lang_tag_content) is None:
            kept_lang = cdslang

        def clean_language_tag(lang_match):
            """
            Returns the text content of the tag if its language is the
            kept language, and an empty string otherwise.

            @param lang_match a match object corresponding to one localized tag
            """
            if lang_match.group(1) == kept_lang:
                return lang_match.group(2)
            return ""

        return ln_pattern.sub(clean_language_tag, lang_tag_content)

    return pattern_lang.sub(search_lang_tag, format_template)
def get_format_template(filename, with_attributes=False):
    """
    Returns the structured content of the given format template.

    If 'with_attributes' is true, returns the name and description. Else 'attrs' is not
    returned as key in dictionary (it might, if it has already been loaded previously)

    {'code':"Some template code"
     'attrs': {'name': "a name", 'description': "a description"}
    }

    Returns None if 'filename' has neither the format template extension
    nor the ".xsl" extension. If the file cannot be read, the error is
    registered and the returned 'code' is an empty string.

    @param filename the filename of a format template
    @param with_attributes if True, fetch the attributes (names and description) for format
    @return structured content of format template
    """
    global format_templates_cache

    if not filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION) and \
           not filename.endswith(".xsl"):
        return None

    # The cached entry is only returned when attributes are requested
    # and the entry already holds them. In every other case the
    # template is read again from disk.
    if filename in format_templates_cache:
        if with_attributes and \
               'attrs' in format_templates_cache[filename]:
            return format_templates_cache[filename]

    format_template = {'code':""}
    try:
        path = "%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH, os.sep, filename)

        format_file = open(path)
        try:
            format_content = format_file.read()
        finally:
            # Release the file handle even if reading fails
            format_file.close()

        # Load format template code
        # Remove name and description
        if filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION):
            code_and_description = pattern_format_template_name.sub("",
                                                                    format_content)
            code = pattern_format_template_desc.sub("", code_and_description)
        else:
            code = format_content

        format_template['code'] = code

    except Exception:
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE", filename, str(e))],
                                        stream='error', ln=cdslang)
        register_errors(errors, 'error')

    # Save attributes if necessary
    if with_attributes:
        format_template['attrs'] = get_format_template_attrs(filename)

    # Cache and return
    format_templates_cache[filename] = format_template
    return format_template
def get_format_templates(with_attributes=False):
    """
    Returns all format templates, as a dictionary with filenames as keys.

    Only the files of the templates directory that end with the format
    template extension or with ".xsl" are loaded.

    If 'with_attributes' is true, returns the name and description. Else 'attrs' is not
    returned as key in each dictionary (it might, if it has already been loaded previously)

    {'filename_1': {'code':"Some template code"
                    'attrs': {'name': "a name", 'description': "a description"}
                   },
     ...
    }

    @param with_attributes if True, fetch the attributes (names and description) for formats
    @return a dict mapping each template filename to its structured content
    """
    format_templates = {}
    for filename in os.listdir(CFG_BIBFORMAT_TEMPLATES_PATH):
        is_bft = filename.endswith("."+CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION)
        if not (is_bft or filename.endswith(".xsl")):
            continue
        format_templates[filename] = get_format_template(filename,
                                                         with_attributes)

    return format_templates
def get_format_template_attrs(filename):
    """
    Returns the attributes of the format template with given filename.

    The attributes are {'name', 'description'}.
    For ".xsl" files the name is the filename without its extension and
    the description stays empty. For other files, name and description
    are searched in the file content; the filename is used as name when
    no name is found.

    If the file cannot be read, the error is registered and the
    filename is used as name.

    @param filename the filename of a format template, relative to the templates directory
    @return a dict with keys 'name' and 'description'
    """
    attrs = {}
    attrs['name'] = ""
    attrs['description'] = ""
    try:
        template_file = open("%s%s%s" % (CFG_BIBFORMAT_TEMPLATES_PATH,
                                         os.sep,
                                         filename))
        try:
            code = template_file.read()
        finally:
            # Release the file handle even if reading fails
            template_file.close()

        if filename.endswith(".xsl"):
            # .xsl
            attrs['name'] = filename[:-4]
        else:
            # .bft
            match = pattern_format_template_name.search(code)
            if match is not None:
                attrs['name'] = match.group('name')
            else:
                attrs['name'] = filename

            match = pattern_format_template_desc.search(code)
            if match is not None:
                attrs['description'] = match.group('desc').rstrip('.')
    except Exception:
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_TEMPLATE_FILE",
                                          filename, str(e))],
                                        stream='error', ln=cdslang)
        register_errors(errors, 'error')
        attrs['name'] = filename

    return attrs
def get_format_element(element_name, verbose=0, with_built_in_params=False):
    """
    Returns the format element structured content.

    Return None if element cannot be loaded (file not found, not readable or
    invalid). When verbose >= 7, an Exception is raised instead if the
    element file cannot be imported or has no 'format' function.

    The returned structure is {'attrs': {some attributes in dict. See get_format_element_attrs_from_*}
                               'code': the_function_code,
                               'type':"field" or "python" depending if element is defined in file or table,
                               'escape_function': the function to call to know if element output must be escaped}

    @param element_name the name of the format element to load
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @param with_built_in_params if True, load the parameters built in all elements
    @return a dictionary with format element attributes
    """
    global format_elements_cache

    def report(messages):
        """Registers (verbose 0) or prints on stderr (verbose >= 5) the given messages."""
        if verbose == 0:
            register_errors(messages, 'error')
        elif verbose >= 5:
            sys.stderr.write(messages[0][1])

    # Resolve filename and prepare 'name' as key for the cache
    filename = resolve_format_element_filename(element_name)
    if filename is None:
        name = element_name.upper()
    else:
        name = filename.upper()

    # Get from cache whenever possible: the cached element is good
    # enough unless built-in parameters are wanted and not loaded yet
    if name in format_elements_cache:
        element = format_elements_cache[name]
        if not with_built_in_params or \
               'builtin_params' in element['attrs']:
            return element

    if filename is None:
        # No file: element is maybe in tag table
        if not bibformat_dblayer.tag_exists_for_name(element_name):
            report(get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_NOT_FOUND",
                                            element_name)],
                                          stream='error', ln=cdslang))
            return None

        format_element = {'attrs': get_format_element_attrs_from_table( \
                                       element_name,
                                       with_built_in_params),
                          'code':None,
                          'escape_function':None,
                          'type':"field"}
        # Cache and returns
        format_elements_cache[name] = format_element
        return format_element

    # Element is defined in a file
    errors = []
    module_name = filename
    if module_name.endswith(".py"):
        module_name = module_name[:-3]

    # Load element
    try:
        module = __import__(CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH + \
                            "." + module_name)

        # __import__ gives back the first component of the dotted
        # path: walk down the remaining components to reach the
        # elements package itself.
        # Used to keep flexibility regarding where elements
        # directory is (for eg. test cases)
        for comp in CFG_BIBFORMAT_ELEMENTS_IMPORT_PATH.split(".")[1:]:
            module = getattr(module, comp)
    except Exception:
        # We catch all exceptions here, as we just want to print
        # traceback in all cases
        (e, tb) = sys.exc_info()[1:]
        stack = traceback.format_exception(Exception, e, tb, limit=None)
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_IN_FORMAT_ELEMENT",
                                          element_name,"\n" + "\n".join(stack[-2:-1]))],
                                        stream='error', ln=cdslang)
        report(errors)

    if errors:
        if verbose >= 7:
            raise Exception(errors[0][1])
        return None

    # Load function 'format()' inside element
    try:
        function_format = module.__dict__[module_name].format
    except AttributeError:
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_FORMAT_ELEMENT_FORMAT_FUNCTION",
                                          element_name)],
                                        stream='warning', ln=cdslang)
        report(errors)

    if errors:
        if verbose >= 7:
            raise Exception(errors[0][1])
        return None

    # Load function 'escape_values()' inside element, if it has one
    function_escape = getattr(module.__dict__[module_name],
                              'escape_values',
                              None)

    # Prepare, cache and return
    element_attrs = get_format_element_attrs_from_function( \
                        function_format,
                        element_name,
                        with_built_in_params)
    format_element = {'code': function_format,
                      'escape_function': function_escape,
                      'attrs': element_attrs,
                      'type': "python"}
    format_elements_cache[name] = format_element
    return format_element
def get_format_elements(with_built_in_params=False):
    """
    Returns the list of format elements attributes as dictionary structure

    Elements declared in files have priority over element declared in 'tag' table
    The returned object has this format:
    {element_name1: {'attrs': {'description':..., 'seealso':...
                               'params':[{'name':..., 'default':..., 'description':...}, ...]
                               'builtin_params':[{'name':..., 'default':..., 'description':...}, ...]
                              },
                     'code': code_of_the_element
                    },
     element_name2: {...},
     ...}

    Returns only elements that could be loaded (not error in code)

    @param with_built_in_params if True, load the parameters built in all elements
    @return a dict of format elements with name as key, and a dict as attributes
    """
    format_elements = {}

    # Elements coming from the names of the 'tag' table
    mappings = bibformat_dblayer.get_all_name_tag_mappings()
    for name in mappings:
        element = get_format_element(name,
                                     with_built_in_params=with_built_in_params)
        # get_format_element() returns None for elements that cannot be
        # loaded: leave them out, as done below for elements in files
        if element is not None:
            format_elements[name.upper().replace(" ", "_").strip()] = element

    # Elements coming from files, which override the ones loaded above
    files = os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH)
    for filename in files:
        filename_test = filename.upper().replace(" ", "_")
        if filename_test.endswith(".PY") and filename.upper() != "__INIT__.PY":
            # Strip the optional "BFE_" prefix and the ".PY" extension
            if filename_test.startswith("BFE_"):
                filename_test = filename_test[4:]
            element_name = filename_test[:-3]
            element = get_format_element(element_name,
                                         with_built_in_params=with_built_in_params)
            if element is not None:
                format_elements[element_name] = element

    return format_elements
def get_format_element_attrs_from_function(function, element_name,
                                           with_built_in_params=False):
    """
    Returns the attributes of the function given as parameter.

    It looks for standard parameters of the function, default
    values and comments in the docstring.

    The attributes are {'name' : "name of element" #basically the name of 'name' parameter
                        'description': "a string description of the element",
                        'seealso' : ["element_1.py", "element_2.py", ...] #a list of related elements
                        'params': [{'name':"param_name", #a list of parameters for this element (except 'bfo')
                                    'default':"default value",
                                    'description': "a description"}, ...],
                        'builtin_params': [{'name':"param_name", #the parameters builtin for all elem of this kind
                                            'default':"default value",
                                            'description': "a description"}, ...],
                       }

    'builtin_params' is only present when 'with_built_in_params' is true.

    @param function the formatting function of a format element
    @param element_name the name of the element
    @param with_built_in_params if True, load the parameters built in all elements
    @return a dict of attributes
    """
    attrs = {'description': "",
             'name': element_name.replace(" ", "_").upper(),
             'seealso': []}

    docstring = function.__doc__
    has_docstring = isinstance(docstring, str)

    if has_docstring:
        # The description is whatever comes before the first @param
        # and before the first @see
        description = docstring.split("@param")[0].split("@see")[0]
        attrs['description'] = description.strip().rstrip('.')

        # Look for @see in docstring
        match = pattern_format_element_seealso.search(docstring)
        if match is not None:
            for element in match.group('see').rstrip('.').split(","):
                attrs['seealso'].append(element.strip())

    # Look for parameters in function definition
    (args, varargs, varkw, defaults) = inspect.getargspec(function)

    # Defaults belong to the last arguments: walk both sequences from
    # the end, and give None to the arguments that have no default
    reversed_args = args[::-1]
    if defaults is None:
        reversed_defaults = []
    else:
        reversed_defaults = list(defaults)[::-1]
    missing = len(reversed_args) - len(reversed_defaults)
    reversed_defaults = reversed_defaults + [None] * missing

    params = {}
    for arg, default in zip(reversed_args, reversed_defaults):
        if arg == "bfo":
            # Don't keep this as parameter. It is hidden to users, and
            # exists in all elements of this kind
            continue
        if default is None:
            # In case no check is made inside element, we prefer to
            # print "" (nothing) than None in output
            default = ""
        params[arg] = {'name': arg,
                       'default': default,
                       'description': "(no description provided)"}

    if has_docstring:
        # Look for @param descriptions in docstring.
        # Add description to existing parameters in params dict
        for match in pattern_format_element_params.finditer(docstring):
            name = match.group('name')
            if name in params:
                params[name]['description'] = match.group('desc').rstrip('.')

    attrs['params'] = params.values()

    # Load built-in parameters if necessary
    if with_built_in_params:
        attrs['builtin_params'] = [
            {'name': "prefix",
             'default': "",
             'description': "A prefix printed only if the\n"
                            "record has a value for this element"},
            {'name': "suffix",
             'default': "",
             'description': "A suffix printed only if the\n"
                            "record has a value for this element"},
            {'name': "default",
             'default': "",
             'description': "A default value printed if the\n"
                            "record has no value for this element"},
            {'name': "escape",
             'default': "",
             'description': "If set to 1, replaces special\n"
                            "characters '&', '<' and '>' of this\n"
                            "element by SGML entities"},
        ]

    return attrs
def get_format_element_attrs_from_table(element_name,
                                        with_built_in_params=False):
    """
    Returns the attributes of the format element with given name in 'tag' table.

    The attributes are {'name' : "name of element" #basically the name of 'element_name' parameter
                        'description': "a string description of the element",
                        'seealso' : [] #a list of related elements. Always empty in this case
                        'params': [],  #a list of parameters for this element. Always empty in this case
                        'builtin_params': [{'name':"param_name", #the parameters builtin for all elem of this kind
                                            'default':"default value",
                                            'description': "a description"}, ...],
                        'tags':["950.1", 203.a] #the list of tags printed by this element
                       }

    'builtin_params' is only present when 'with_built_in_params' is true.

    @param element_name an element name in database
    @param with_built_in_params if True, load the parameters built in all elements
    @return a dict of attributes
    """
    tags = bibformat_dblayer.get_tags_from_name(element_name)

    # Singular or plural label, depending on the number of tags
    if len(tags) > 1:
        field_label = "fields"
    else:
        field_label = "field"

    attrs = {'description': "Prints %s %s of the record" % (field_label,
                                                            ", ".join(tags)),
             'name': element_name.replace(" ", "_").upper(),
             'seealso': [],
             'params': [],
             'tags': tags}

    # Load built-in parameters if necessary
    if with_built_in_params:
        attrs['builtin_params'] = [
            {'name': "prefix",
             'default': "",
             'description': "A prefix printed only if the\n"
                            "record has a value for this element"},
            {'name': "suffix",
             'default': "",
             'description': "A suffix printed only if the\n"
                            "record has a value for this element"},
            {'name': "separator",
             'default': " ",
             'description': "A separator between elements of\n"
                            "the field"},
            {'name': "nbMax",
             'default': "",
             'description': "The maximum number of values to\n"
                            "print for this element. No limit if not\n"
                            "specified"},
            {'name': "default",
             'default': "",
             'description': "A default value printed if the\n"
                            "record has no value for this element"},
            {'name': "escape",
             'default': "",
             'description': "If set to 1, replaces special\n"
                            "characters '&', '<' and '>' of this\n"
                            "element by SGML entities"},
        ]

    return attrs
def get_output_format(code, with_attributes=False, verbose=0):
    """
    Returns the structured content of the given output format

    If 'with_attributes' is true, also returns the names and description of the output formats,
    else 'attrs' is not returned in dict (it might, if it has already been loaded previously).

    If the output format corresponding to 'code' is not found, an error
    is registered and an empty structure is returned (without being cached).

    See get_output_format_attrs() to learn more on the attributes

    {'rules': [ {'field': "980__a",
                 'value': "PREPRINT",
                 'template': "filename_a.bft",
                },
                {...}
              ],
     'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}}
               'description': "a description"
               'code': "fnm1",
               'content_type': "application/ms-excel",
               'visibility': 1
              }
     'default':"filename_b.bft"
    }

    @param code the code of an output_format
    @param with_attributes if True, fetch the attributes (names and description) for format
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return structured content of output format
    """
    global format_outputs_cache

    output_format = {'rules':[], 'default':""}

    filename = resolve_output_format_filename(code, verbose)
    if filename is None:
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_OUTPUT_FORMAT_CODE_UNKNOWN", code)],
                                        stream='error', ln=cdslang)
        register_errors(errors, 'error')
        if with_attributes: # Create empty attrs if asked for attributes
            output_format['attrs'] = get_output_format_attrs(code, verbose)
        return output_format

    # Get from cache whenever possible
    if filename in format_outputs_cache:
        # If we must return with attributes but cache has not
        # attributes, then load attributes
        if with_attributes and \
               'attrs' not in format_outputs_cache[filename]:
            format_outputs_cache[filename]['attrs'] = get_output_format_attrs(code, verbose)

        return format_outputs_cache[filename]

    try:
        if with_attributes:
            output_format['attrs'] = get_output_format_attrs(code, verbose)

        path = "%s%s%s" % (CFG_BIBFORMAT_OUTPUTS_PATH, os.sep, filename )
        format_file = open(path)
        try:
            current_tag = ''
            for line in format_file:
                line = line.strip()
                if line == "":
                    # Ignore blank lines
                    continue
                if line.endswith(":"):
                    # Line introduces a tag: the following rules apply
                    # to it. Remove : spaces and eol at the end of line
                    clean_line = line.rstrip(": \n\r")
                    # The tag starts at second position
                    current_tag = "".join(clean_line.split()[1:]).strip()
                elif '---' in line:
                    # Rule line: the template comes after the last
                    # '---', the condition is what comes before
                    words = line.split('---')
                    template = words[-1].strip()
                    condition = ''.join(words[:-1])

                    output_format['rules'].append({'field': current_tag,
                                                   'value': condition,
                                                   'template': template,
                                                   })
                elif ':' in line:
                    # Default case
                    default = line.split(':')[1].strip()
                    output_format['default'] = default
        finally:
            # The file must not stay open, whether parsing succeeded
            # or not
            format_file.close()

    except Exception:
        e = sys.exc_info()[1]
        errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_READ_OUTPUT_FILE", filename, str(e))],
                                        stream='error', ln=cdslang)
        register_errors(errors, 'error')

    # Cache and return
    format_outputs_cache[filename] = output_format
    return output_format
def get_output_format_attrs(code, verbose=0):
    """
    Returns the attributes of an output format.

    The attributes contain 'code', which is the short identifier of the output format
    (to be given as parameter in format_record function to specify the output format),
    'description', a description of the output format, 'visibility' the visibility of
    the format in the output format list on public pages and 'names', the localized names
    of the output format. If 'content_type' is specified then the search_engine will
    send a file with this content type and with result of formatting as content to the user.
    The 'names' dict always contains 'generic', 'ln' (for long name) and 'sn' (for short names)
    keys. 'generic' is the default name for output format. 'ln' and 'sn' contain long and short
    localized names of the output format. Only the languages for which a localization exist
    are used.

    {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}}
     'description': "a description"
     'code': "fnm1",
     'content_type': "application/ms-excel",
     'visibility': 1
    }

    When no output format file matches 'code', the default (empty)
    attributes are returned.

    @param code the short identifier of the format
    @param verbose the level of verbosity from 0 to 9 (0: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return structured content of output format attributes
    """
    # Accept the code given with the output format file extension
    extension = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION
    if code.endswith(extension):
        code = code[:-len(extension)]

    attrs = {'names':{'generic':"",
                      'ln':{},
                      'sn':{}},
             'description':'',
             'code':code.upper(),
             'content_type':"",
             'visibility':1}

    if resolve_output_format_filename(code, verbose) is None:
        return attrs

    # The output format exists: fetch its attributes through the
    # database layer
    attrs['names'] = bibformat_dblayer.get_output_format_names(code)
    attrs['description'] = bibformat_dblayer.get_output_format_description(code)
    attrs['content_type'] = bibformat_dblayer.get_output_format_content_type(code)
    attrs['visibility'] = bibformat_dblayer.get_output_format_visibility(code)

    return attrs
def get_output_formats(with_attributes=False):
    """
    Returns all output formats, as a dictionary with their filename as key

    If 'with_attributes' is true, also returns the names and description of the output formats,
    else 'attrs' is not returned in dicts (it might, if it has already been loaded previously).

    See get_output_format_attrs() to learn more on the attributes

    {'filename_1.bfo': {'rules': [ {'field': "980__a",
                                    'value': "PREPRINT",
                                    'template': "filename_a.bft",
                                   },
                                   {...}
                                 ],
                        'attrs': {'names': {'generic':"a name", 'sn':{'en': "a name", 'fr':"un nom"}, 'ln':{'en':"a long name"}}
                                  'description': "a description"
                                  'code': "fnm1"
                                 }
                        'default':"filename_b.bft"
                       },
     'filename_2.bfo': {...},
     ...
    }

    @param with_attributes if True, fetch the attributes (names and description) for formats
    @return the dict of output formats
    """
    output_formats = {}

    for filename in os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH):
        if not filename.endswith("."+CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION):
            continue
        # The code is the filename without its last dot-separated part
        # (dots inside the remaining part are dropped as well)
        name_parts = filename.split(".")[:-1]
        code = "".join(name_parts)
        output_formats[filename] = get_output_format(code, with_attributes)

    return output_formats
def get_kb_mapping(kb, string, default=""):
    """
    Returns the value of the 'string' in the knowledge base 'kb'.

    If kb does not exist or string does not exist in kb, returns 'default'
    string value.

    Values fetched from the database layer are kept in
    'kb_mappings_cache', including the None of a missing mapping.

    @param kb a knowledge base name
    @param string a key in a knowledge base
    @param default a default value if 'string' is not in 'kb'
    @return the value corresponding to the given string in given kb
    """
    global kb_mappings_cache

    # Precreate the cache of this kb if needed
    if kb not in kb_mappings_cache:
        kb_mappings_cache[kb] = {}
    kb_cache = kb_mappings_cache[kb]

    if string in kb_cache:
        value = kb_cache[string]
    else:
        value = bibformat_dblayer.get_kb_mapping_value(kb, string)
        kb_cache[str(string)] = value

    if value is None:
        return default
    return value
def resolve_format_element_filename(string):
    """
    Returns the filename of element corresponding to string

    The comparison ignores case and treats spaces as underscores.
    A file also matches when it differs from the given name only by a
    leading "BFE_" prefix, on either side.

    @param string a name for a format element, with or without ".py" extension
    @return the corresponding filename, with right case, or None if no file matches
    """
    name = string.replace(" ", "_").upper()
    if not string.endswith(".py"):
        name = name + ".PY"

    for filename in os.listdir(CFG_BIBFORMAT_ELEMENTS_PATH):
        candidate = filename.replace(" ", "_").upper()
        if candidate in (name, "BFE_" + name) or \
               "BFE_" + candidate == name:
            return filename

    # No element with that name found
    # Do not log error, as it might be a normal execution case:
    # element can be in database
    return None
def resolve_output_format_filename(code, verbose=0):
    """
    Returns the filename of the output format corresponding to 'code'.

    The lookup in CFG_BIBFORMAT_OUTPUTS_PATH ignores case.  Every
    character of 'code' other than ASCII letters, digits and '.' is
    dropped first, and the output format extension is appended when it
    is missing.

    When nothing matches, an error is built and then, depending on
    'verbose': registered (0), written to stderr (>= 5), and the process
    is additionally terminated (>= 9).

    @param code the code for an output format
    @param verbose the level of verbosity from 0 to 9 (O: silent,
                                                       5: errors,
                                                       7: errors and warnings,
                                                       9: errors and warnings, stop if error (debug mode ))
    @return the corresponding filename, with right case, or None if not found
    """
    extension = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION

    # Remove non alphanumeric chars (except .)
    code = re.sub(r"[^.0-9a-zA-Z]", "", code)
    if not code.endswith(extension):
        # No extension given: drop remaining dots too, then add it
        code = re.sub(r"\W", "", code) + extension

    wanted = code.upper()
    for candidate in os.listdir(CFG_BIBFORMAT_OUTPUTS_PATH):
        if candidate.upper() == wanted:
            return candidate

    # No output format with that name found
    errors = get_msgs_for_code_list([("ERR_BIBFORMAT_CANNOT_RESOLVE_OUTPUT_NAME", code)],
                                    stream='error', ln=cdslang)
    if verbose == 0:
        register_errors(errors, 'error')
    elif verbose >= 5:
        sys.stderr.write(errors[0][1])
        if verbose >= 9:
            sys.exit(errors[0][1])
    return None
def get_fresh_format_template_filename(name):
    """
    Returns a new filename and name for template with given name.

    Used when writing a new template to a file, so that the filename
    has no space and is unique in the template directory
    (CFG_BIBFORMAT_TEMPLATES_PATH).

    The filename keeps only ASCII letters, digits and '.'.  When a file
    with that name already exists, a numeric suffix (2, 3, ...) is
    appended to the sanitized filename and to the returned name until
    the filename is free.

    Returns (unique_filename, modified_name)

    @param name a name for a format template
    @return the corresponding filename, and modified name if necessary
    """
    extension = "." + CFG_BIBFORMAT_FORMAT_TEMPLATE_EXTENSION
    name = name.replace(" ", "_")

    # Remove non alphanumeric chars (except .)
    base_filename = re.sub(r"[^.0-9a-zA-Z]", "", name)
    filename = base_filename

    index = 1
    while os.path.exists(CFG_BIBFORMAT_TEMPLATES_PATH + os.sep +
                         filename + extension):
        index += 1
        # Suffix the *sanitized* base, so that characters filtered out
        # above cannot come back into the filename on a collision.
        filename = base_filename + str(index)

    if index > 1:
        returned_name = (name + str(index)).replace("_", " ")
    else:
        returned_name = name.replace("_", " ")

    return (filename + extension, returned_name)
def get_fresh_output_format_filename(code):
    """
    Returns a new filename for output format with given code.

    Used when writing a new output format to a file.  The code is
    uppercased, stripped of everything but ASCII letters, digits and
    '.', and cut to at most 6 characters, as the convention is that
    filename == output format code (+ .extension).

    If the filename is already taken in CFG_BIBFORMAT_OUTPUTS_PATH, a
    counter starting at 2 is appended; when that would exceed 6
    characters, the counter overwrites the end of the code instead.

    Returns (unique_filename, modified_code)

    @param code the code of an output format
    @return the corresponding filename, and modified code if necessary
    """
    extension = "." + CFG_BIBFORMAT_FORMAT_OUTPUT_EXTENSION

    # Spaces become underscores, which the filter below then removes
    # together with every other non alphanumeric char (except .)
    code = re.sub(r"[^.0-9a-zA-Z]", "", code.upper().replace(" ", "_"))
    code = code[:6]

    filename = code
    index = 2
    while os.path.exists(CFG_BIBFORMAT_OUTPUTS_PATH + os.sep +
                         filename + extension):
        suffix = str(index)
        filename = code + suffix
        if len(filename) > 6:
            # Make room for the counter inside the 6 chars limit
            filename = code[:-len(suffix)] + suffix
        index += 1

        # We should not try more than 99999... Well I don't see how we
        # could get there.. Sanity check.
        if index >= 99999:
            errors = get_msgs_for_code_list([("ERR_BIBFORMAT_NB_OUTPUTS_LIMIT_REACHED", code)],
                                            stream='error', ln=cdslang)
            register_errors(errors, 'error')
            sys.exit("Output format cannot be named as %s"%code)

    return (filename + extension, filename)
def clear_caches():
    """
    Reset the module-level caches (format templates, format elements,
    output formats and knowledge base mappings) to fresh empty dicts.
    """
    global format_templates_cache, format_elements_cache
    global format_outputs_cache, kb_mappings_cache

    # One distinct dictionary per cache: they must not be aliased
    format_templates_cache, format_elements_cache = {}, {}
    format_outputs_cache, kb_mappings_cache = {}, {}
class BibFormatObject:
"""
An object that encapsulates a record and associated methods, and that is given
as parameter to all format elements 'format' function.
The object is made specifically for a given formatting, i.e. it includes
for example the language for the formatting.
The object provides basic accessors to the record. For full access, one can get
the record with get_record() and then use BibRecord methods on the returned object.
"""
# The record
record = None
# The language in which the formatting has to be done
lang = cdslang
# A list of string describing the context in which the record has
# to be formatted.
# It represents the words of the user request in web interface search
search_pattern = []
# The id of the record
recID = 0
- # The user id of the person who will view the formatted page (if applicable)
- # This allows for example to print a "edit record" link for people
- # who have right to edit a record.
- uid = None
+ uid = None # DEPRECATED: use bfo.user_info['uid'] instead
+
+ # The information about the user, as returned by
+ # 'webuser.collect_user_info(req)'
+ user_info = None
# The format in which the record is being formatted
format = ''
- # The mod_python request object
- req = None
+ req = None # DEPRECATED: use bfo.user_info instead
# NOTE(review): 'search_pattern=[]' is a mutable default argument; the same
# list object is shared by every call that omits it and is stored as-is on
# the instance below.
def __init__(self, recID, ln=cdslang, search_pattern=[],
- xml_record=None, uid=None, format='', req=None):
+ xml_record=None, user_info=None, format=''):
"""
Creates a new bibformat object, with given record.
You can either specify a record ID to format, or give its xml representation.
if 'xml_record' is not None, use 'xml_record' instead of recID for the record.
- 'uid' allows to grant access to some functionalities on a page depending
- on the user's priviledges.
+ 'user_info' allows to grant access to some functionalities on
+ a page depending on the user's privileges. It is a dictionary
+ in the following form:
+ user_info = {
+ 'remote_ip' : '',
+ 'remote_host' : '',
+ 'referer' : '',
+ 'uri' : '',
+ 'agent' : '',
+ 'apache_user' : '',
+ 'apache_group' : [],
+ 'uid' : -1,
+ 'nickname' : '',
+ 'email' : '',
+ 'group' : [],
+ 'guest' : '1'
+ }
@param recID the id of a record
@param ln the language in which the record has to be formatted
@param search_pattern list of string representing the request used by the user in web interface
@param xml_record a xml string of the record to format
- @param uid the user id of the person who will view the formatted page
+ @param user_info the information of the user who will view the formatted page
@param format the format used for formatting this record
"""
if xml_record is not None:
# If record is given as parameter
self.record = create_record(xml_record)[0]
- # raise repr(create_record(xml_record.decode('utf-8').encode('utf-8')))
- recID = record_get_field_value(self.record,"001")
-
+ recID = record_get_field_value(self.record, "001")
self.lang = wash_language(ln)
self.search_pattern = search_pattern
self.recID = recID
- self.uid = uid
self.format = format
- self.req = req
+ self.user_info = user_info
+ if self.user_info is None:
+ self.user_info = collect_user_info(None)
def get_record(self):
    """
    Returns the record of this BibFormatObject instance, building it
    from the XML of 'self.recID' the first time it is needed.

    @return the record structure as returned by BibRecord
    """
    if self.record is not None:
        return self.record

    # Lazy creation; generate the MARCXML on the fly only if the
    # current output format is xm
    on_the_fly = (self.format.lower() == 'xm')
    xml_record = record_get_xml(self.recID, 'xm', on_the_fly=on_the_fly)
    self.record = create_record(xml_record)[0]
    return self.record
def control_field(self, tag, escape=0):
    """
    Returns the value of control field given by tag in record.

    An empty string is returned when no record is available.

    @param tag the marc code of a field
    @param escape 0 for no escaping; any other value is passed as mode to escape_field()
    @return value of field tag in record
    """
    record = self.get_record()
    if record is None:
        # Case where BibRecord could not parse object
        return ''

    p_tag = parse_tag(tag)
    field_value = record_get_field_value(record,
                                         p_tag[0],
                                         p_tag[1],
                                         p_tag[2],
                                         p_tag[3])
    if escape != 0:
        return escape_field(field_value, escape)
    return field_value
def field(self, tag, escape=0):
    """
    Returns the first of the values that self.fields(tag) yields for
    the current record, or an empty string if there is none.

    'escape' selects how special characters of the value are escaped:
    0 means no escaping, any other value is handed over to
    escape_field() as its 'mode' argument.

    @param tag the marc code of a field
    @param escape 0 for no escaping, else an escape_field() mode
    @return value of field tag in record
    """
    values = self.fields(tag)
    if not len(values) > 0:
        return ""

    first_value = values[0]
    if escape != 0:
        # Escaping is applied to the selected value only
        return escape_field(first_value, escape)
    return first_value
def fields(self, tag, escape=0, repeatable_subfields_p=False):
    """
    Returns the list of values corresponding to "tag".

    If the tag has a subcode, simply returns the list of values
    corresponding to tag.
    If tag has an undefined subcode (such as 999C5), the function
    returns a list of dictionaries, one per field instance, whose keys
    are the subcodes and whose values are the values of tag.subcode.

    Eg. for given MARC:
        999C5 $a value_1a $b value_1b
        999C5 $b value_2b
        999C5 $b value_3b $b value_3b_bis

        >> bfo.fields('999C5b')
        >> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
        >> bfo.fields('999C5')
        >> [{'a':'value_1a', 'b':'value_1b'},
            {'b':'value_2b'},
            {'b':'value_3b'}]

    By default the function returns only one value for each subfield
    (that is it considers that repeatable subfields are not allowed).
    It is why in the above example 'value_3b_bis' is not shown for
    bfo.fields('999C5') (do not rely on which of value_3b or
    value_3b_bis is kept).  This is to simplify the use of the
    function, as most of the time subfields are not repeatable (in
    that way we get a string instead of a list).  You can allow
    repeatable subfields by setting 'repeatable_subfields_p' parameter
    to True.  In this mode, the above example would return:

        >> bfo.fields('999C5b', repeatable_subfields_p=True)
        >> ['value_1b', 'value_2b', 'value_3b', 'value_3b_bis']
        >> bfo.fields('999C5', repeatable_subfields_p=True)
        >> [{'a':['value_1a'], 'b':['value_1b']},
            {'b':['value_2b']},
            {'b':['value_3b', 'value_3b_bis']}]

    NOTICE THAT THE RETURNED STRUCTURE IS DIFFERENT.  Also note that
    whatever the value of 'repeatable_subfields_p' is,
    bfo.fields('999C5b') always shows all values, even repeated ones:
    the parameter is ignored when a subcode is given.

    'escape' parameter allows to escape special characters of the
    fields: 0 means no escaping, any other value is passed as 'mode'
    to escape_field() for each returned value.

    An empty list is returned when no record is available.

    @param tag the marc code of a field
    @param escape 0 for no escaping, else an escape_field() mode
    @param repeatable_subfields_p if True, returns the list of subfields in the dictionary
    @return values of field tag in record
    """
    record = self.get_record()
    if record is None:
        # Case where BibRecord could not parse object
        return []

    p_tag = parse_tag(tag)
    if p_tag[3] != "":
        # Subcode has been defined. Simply returns list of values
        values = record_get_field_values(record,
                                         p_tag[0],
                                         p_tag[1],
                                         p_tag[2],
                                         p_tag[3])
        if escape == 0:
            return values
        return [escape_field(value, escape) for value in values]

    # Subcode is undefined. Returns list of dicts.
    # However it might be the case of a control field.
    instances = record_get_field_instances(record,
                                           p_tag[0],
                                           p_tag[1],
                                           p_tag[2])
    if repeatable_subfields_p:
        list_of_instances = []
        for instance in instances:
            instance_dict = {}
            for subfield in instance[0]:
                if escape == 0:
                    value = subfield[1]
                else:
                    value = escape_field(subfield[1], escape)
                # Group repeated subcodes into one list per subcode
                instance_dict.setdefault(subfield[0], []).append(value)
            list_of_instances.append(instance_dict)
        return list_of_instances

    if escape == 0:
        return [dict(instance[0]) for instance in instances]
    return [dict([(subfield[0], escape_field(subfield[1], escape))
                  for subfield in instance[0]])
            for instance in instances]
def kb(self, kb, string, default=""):
    """
    Returns the value of the "string" in the knowledge base "kb", as
    looked up by get_kb_mapping().

    'default' is returned when 'string' is None (no lookup is done in
    that case) or when the lookup yields None.

    @param kb a knowledge base name
    @param string the string we want to translate
    @param default a default value returned if 'string' not found in 'kb'
    """
    if string is None:
        return default

    mapped_value = get_kb_mapping(kb, string, default)
    if mapped_value is not None:
        return mapped_value
    return default
def escape_field(value, mode=0):
    """
    Utility function used to escape the value of a field in given mode.

    - mode 1: escape the value with cgi.escape()
    - mode 2: wash the value with washer.wash(), allowing only the
              'href', 'name' and 'class' attributes
    - mode 3: mix of mode 1 and mode 2. If the value, once leading
              spaces and newlines are skipped, starts with the
              module-level 'html_field' marker, then behave as mode 2.
              Else behave as mode 1.
    - mode 4: wash the value with washer.wash() with empty tag and
              attribute whitelists
    - any other mode (including the default 0): no escaping, the value
      is returned unchanged

    @param value the value to escape
    @param mode the escaping mode, see above
    @return the escaped value
    """
    if mode == 1:
        return cgi.escape(value)

    if mode == 4:
        return washer.wash(value,
                           allowed_attribute_whitelist=[],
                           allowed_tag_whitelist=[]
                           )

    if mode == 2 or \
       (mode == 3 and value.lstrip(' \n').startswith(html_field)):
        return washer.wash(value,
                           allowed_attribute_whitelist=['href',
                                                        'name',
                                                        'class']
                           )

    if mode == 3:
        # Mode 3 without the HTML marker falls back to plain escaping
        return cgi.escape(value)

    return value
def bf_profile():
    """
    Runs a benchmark: formats records 1 to 50 in "HD" format, in the
    default language and with verbosity 9.
    """
    for recid in range(1, 51):
        format_record(recid, "HD", ln=cdslang, verbose=9, search_pattern=[])
if __name__ == "__main__":
    import profile
    import pstats

    # Profile the benchmark, dump the raw statistics to a file, then
    # print them sorted by cumulative time.
    profile_filename = "bibformat_profile"
    profile.run('bf_profile()', profile_filename)
    stats = pstats.Stats(profile_filename)
    stats.strip_dirs().sort_stats("cumulative").print_stats()
diff --git a/modules/bibformat/lib/bibformat_regression_tests.py b/modules/bibformat/lib/bibformat_regression_tests.py
index 338a9ebd8..3820d463f 100644
--- a/modules/bibformat/lib/bibformat_regression_tests.py
+++ b/modules/bibformat/lib/bibformat_regression_tests.py
@@ -1,410 +1,410 @@
# -*- coding: utf-8 -*-
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""WebSearch module regression tests."""
__revision__ = "$Id$"
import unittest
from invenio.config import weburl, cdslang
from invenio.testutils import make_test_suite, \
warn_user_about_tests_and_run, \
test_web_page_content
from invenio.bibformat import format_record
class BibFormatAPITest(unittest.TestCase):
"""Check BibFormat API"""
def test_basic_formatting(self):
"""bibformat - Checking BibFormat API"""
# Format record 73 in 'hx' format through the Python API...
result = format_record(recID=73,
of='hx',
ln=cdslang,
verbose=0,
search_pattern=[],
xml_record=None,
- uid=None,
+ user_info=None,
on_the_fly=True)
# ...and use that output as the text expected on the web page of the
# same record in the same format.
pageurl = weburl + '/record/73?of=hx'
result = test_web_page_content(pageurl,
expected_text=result)
# NOTE(review): the value returned by test_web_page_content() is stored
# but never asserted, so this test cannot fail on a content mismatch.
# BibFormatBibTeXTest.test_bibtex_output checks it with
# self.assertEqual([], result) -- confirm whether the same assertion
# is intended here.
class BibFormatBibTeXTest(unittest.TestCase):
    """Check the BibTeX ('hx') output produced by BibFormat for
    various records"""

    def setUp(self):
        """Prepare some ideal outputs"""
        # Expected BibTeX entry for record 74
        self.record_74_hx = '''
@article{Wang:74,
author = "Wang, B and Lin, C Y and Abdalla, E",
title = "Quasinormal modes of Reissner-Nordstrom Anti-de Sitter
Black Holes",
journal = "Phys. Lett., B",
number = "hep-th/0003295",
volume = "481",
pages = "79-88",
year = "2000",
}
'''

    def test_bibtex_output(self):
        """bibformat - BibTeX output"""
        # The page check is expected to return an empty list on success
        mismatches = test_web_page_content(weburl + '/record/74?of=hx',
                                           expected_text=self.record_74_hx)
        self.assertEqual([], mismatches)
class BibFormatDetailedHTMLTest(unittest.TestCase):
"""Check output produced by BibFormat for detailed HTML ouput for
various records"""
def setUp(self):
"""Prepare some ideal outputs"""
# Record 74 (Article)
self.record_74_hd_header = '''
Published Article / Particle Physics - Theory
hep-th/0003295
'''
self.record_74_hd_title = '''
Quasinormal modes of Reissner-Nordstrom Anti-de Sitter Black Holes
'''
self.record_74_hd_authors = '''Wang, B (Fudan University) ; Lin, C Y ; Abdalla, E '''% \
{'weburl' : weburl,
'lang': cdslang}
self.record_74_hd_abstract = '''Abstract: Complex frequencies associated with quasinormal modes for large Reissner-Nordstr$\ddot{o}$m Anti-de Sitter black holes have been computed. These frequencies have close relation to the black hole charge and do not linearly scale withthe black hole temperature as in Schwarzschild Anti-de Sitter case. In terms of AdS/CFT correspondence, we found that the bigger the black hole charge is, the quicker for the approach to thermal equilibrium in the CFT. The propertiesof quasinormal modes for $l>0$ have also been studied. '''
self.record_74_hd_pubinfo = '''Published in: Phys. Lett., B :481 2000 79-88'''
self.record_74_hd_fulltext = '''0003295.pdf">Cited by: try citation search for hep-th/0003295'''% \
{'weburl' : weburl,
'lang': cdslang}
self.record_74_hd_references = '''
[17] A. Chamblin, R. Emparan, C. V. Johnson and R. C. Myers, Phys. Rev., D60: 104026 (1999) 5070 90 110 130 150 r+ 130 230 330 50 70 90 110 130 150 r+
'''
# Record 7 (Picture)
self.record_7_hd_header = '''
Pictures / Life at CERN
CERN-GE-9806033
'''
self.record_7_hd_title = '''
Tim Berners-Lee
'''
self.record_7_hd_date = '''
28 Jun 1998
'''
self.record_7_hd_abstract = '''
Caption Conference "Internet, Web, What's next?" on 26 June 1998 at CERN : Tim Berners-Lee, inventor of the World-Wide Web and Director of the W3C, explains how the Web came to be and give his views on the future.
Légende Conference "Internet, Web, What's next?" le 26 juin 1998 au CERN: Tim Berners-Lee, inventeur du World-Wide Web et directeur du W3C, explique comment le Web est ne, et donne ses opinions sur l'avenir.
No match close to %s found in given collections.
#Please try different term.
Displaying matches in any collection...""" % p_orig)
## try to get nbhits for these phrases in any collection:
for phrase in browsed_phrases:
browsed_phrases_in_colls.append([phrase, get_nbhits_in_bibxxx(phrase, f)])
## display results now:
out = websearch_templates.tmpl_browse_pattern(
f=f,
fn=get_field_i18nname(f, ln),
ln=ln,
browsed_phrases_in_colls=browsed_phrases_in_colls,
colls=colls,
)
req.write(out)
return
def browse_in_bibwords(req, p, f, ln=cdslang):
    """Browse inside words indexes.

    Writes to 'req' the 'search in bibwords' template filled with the
    nearest terms box built for pattern 'p' in field 'f'.  Nothing is
    written when 'p' is empty.
    """
    if not p:
        return

    _ = gettext_set_language(ln)

    # Same URL arguments as the current request, but forcing a search
    urlargd = dict(req.argd)
    urlargd['action'] = 'search'

    nearest_box = create_nearest_terms_box(urlargd, p, f, 'w', ln=ln, intro_text_p=0)
    page_part = websearch_templates.tmpl_search_in_bibwords(
        p=p,
        f=f,
        ln=ln,
        nearest_box=nearest_box)
    req.write(page_part)
def search_pattern(req=None, p=None, f=None, m=None, ap=0, of="id", verbose=0, ln=cdslang):
"""Search for complex pattern 'p' within field 'f' according to
matching type 'm'. Return hitset of recIDs.
The function uses multi-stage searching algorithm in case of no
exact match found. See the Search Internals document for
detailed description.
The 'ap' argument governs whether alternative patterns are to
be used in case there is no direct hit for (p,f,m). For
example, whether to replace non-alphanumeric characters by
spaces if it would give some hits. See the Search Internals
document for detailed description. (ap=0 forbids the
alternative pattern usage, ap=1 permits it.)
The 'of' argument governs whether to print or not some
information to the user in case of no match found. (Usually it
prints the information in case of HTML formats, otherwise it's
silent).
The 'verbose' argument controls the level of debugging information
to be printed (0=least, 9=most).
The 'req' argument is the request object the warnings are printed to,
and 'ln' is the language used for the translated messages.
All the parameters are assumed to have been previously washed.
This function is suitable as a mid-level API.
"""
_ = gettext_set_language(ln)
hitset_empty = HitSet()
# sanity check:
if not p:
hitset_full = HitSet(trailing_bits=1)
hitset_full.discard(0)
# no pattern, so return all universe
return hitset_full
# search stage 1: break up arguments into basic search units:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units = create_basic_search_units(req, p, f, m, of)
if verbose and of.startswith("h"):
t2 = os.times()[4]
print_warning(req, "Search stage 1: basic search units are: %s" % basic_search_units)
print_warning(req, "Search stage 1: execution took %.2f seconds." % (t2 - t1))
# search stage 2: do search for each search unit and verify hit presence:
if verbose and of.startswith("h"):
t1 = os.times()[4]
basic_search_units_hitsets = []
for idx_unit in range(0, len(basic_search_units)):
# each unit unpacks as (operator, pattern, field, matching type)
bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit]
basic_search_unit_hitset = search_unit(bsu_p, bsu_f, bsu_m)
# NOTE(review): this debug message is emitted from the stage 2 loop but
# is labelled "Search stage 1".
if verbose >= 9 and of.startswith("h"):
print_warning(req, "Search stage 1: pattern %s gave hitlist %s" % (bsu_p, list(basic_search_unit_hitset)))
# NOTE(review): the condition below is garbled in this copy of the source:
# the text between '(idx_unit+1)' and '0:' is missing, together with the
# statements that define 'bsu_pn' (used a few lines further down).  Restore
# this passage from the original file before changing this function.
if len(basic_search_unit_hitset) > 0 or \
ap==0 or \
bsu_o=="|" or \
((idx_unit+1) 0:
# we retain the new unit instead
if of.startswith('h'):
print_warning(req, _("No exact match found for %(x_query1)s, using %(x_query2)s instead...") % \
{'x_query1': "" + cgi.escape(bsu_p) + "",
'x_query2': "" + cgi.escape(bsu_pn) + ""})
basic_search_units[idx_unit][1] = bsu_pn
basic_search_units_hitsets.append(basic_search_unit_hitset)
else:
# stage 2-3: no hits found either, propose nearest indexed terms:
if of.startswith('h'):
if req:
if bsu_f == "recid":
print_warning(req, "Requested record does not seem to exist.")
else:
print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln))
return hitset_empty
else:
# stage 2-3: no hits found either, propose nearest indexed terms:
if of.startswith('h'):
if req:
if bsu_f == "recid":
print_warning(req, "Requested record does not seem to exist.")
else:
print_warning(req, create_nearest_terms_box(req.argd, bsu_p, bsu_f, bsu_m, ln=ln))
return hitset_empty
if verbose and of.startswith("h"):
t2 = os.times()[4]
for idx_unit in range(0, len(basic_search_units)):
print_warning(req, "Search stage 2: basic search unit %s gave %d hits." %
(basic_search_units[idx_unit][1:], len(basic_search_units_hitsets[idx_unit])))
print_warning(req, "Search stage 2: execution took %.2f seconds." % (t2 - t1))
# search stage 3: apply boolean query for each search unit:
if verbose and of.startswith("h"):
t1 = os.times()[4]
# let the initial set be the complete universe:
hitset_in_any_collection = HitSet(trailing_bits=1)
hitset_in_any_collection.discard(0)
for idx_unit in range(0, len(basic_search_units)):
this_unit_operation = basic_search_units[idx_unit][0]
this_unit_hitset = basic_search_units_hitsets[idx_unit]
# '+' intersects, '-' subtracts, '|' unites with the running result
if this_unit_operation == '+':
hitset_in_any_collection.intersection_update(this_unit_hitset)
elif this_unit_operation == '-':
hitset_in_any_collection.difference_update(this_unit_hitset)
elif this_unit_operation == '|':
hitset_in_any_collection.union_update(this_unit_hitset)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % this_unit_operation, "Error")
if len(hitset_in_any_collection) == 0:
# no hits found, propose alternative boolean query:
if of.startswith('h'):
nearestterms = []
for idx_unit in range(0, len(basic_search_units)):
bsu_o, bsu_p, bsu_f, bsu_m = basic_search_units[idx_unit]
# show %...% patterns to the user as '...'
if bsu_p.startswith("%") and bsu_p.endswith("%"):
bsu_p = "'" + bsu_p[1:-1] + "'"
bsu_nbhits = len(basic_search_units_hitsets[idx_unit])
# create a similar query, but with the basic search unit only
# NOTE(review): 'req.argd' is read here without the 'if req:' guard used
# in stage 2-3 above, although 'req' defaults to None -- confirm callers
# always pass a request when 'of' starts with 'h'.
argd = {}
argd.update(req.argd)
argd['p'] = bsu_p
argd['f'] = bsu_f
nearestterms.append((bsu_p, bsu_nbhits, argd))
text = websearch_templates.tmpl_search_no_boolean_hits(
ln=ln, nearestterms=nearestterms)
print_warning(req, text)
if verbose and of.startswith("h"):
t2 = os.times()[4]
print_warning(req, "Search stage 3: boolean query gave %d hits." % len(hitset_in_any_collection))
print_warning(req, "Search stage 3: execution took %.2f seconds." % (t2 - t1))
return hitset_in_any_collection
def search_unit(p, f=None, m=None):
    """Search for basic search unit defined by pattern 'p' and field
       'f' and matching type 'm'.  Return hitset of recIDs.

       Matching types 'a' and 'r' are searched in the bibxxx tables,
       any other type in the word indexes.  An empty pattern gives an
       empty hitset.

       All the parameters are assumed to have been previously washed.
       'p' is assumed to be already a ``basic search unit'' so that it
       is searched as such and is not broken up in any way.

       This function is suitable as a low-level API.
    """
    if not p: # sanity checking
        return HitSet()

    if m in ('a', 'r'):
        # we are doing either direct bibxxx search or phrase search or regexp search
        return search_unit_in_bibxxx(p, f, m)

    # we are doing bibwords search by default
    return search_unit_in_bibwords(p, f)
def search_unit_in_bibwords(word, f, decompress=zlib.decompress):
    """Searches for 'word' inside bibwordsX table for field 'f' and returns hitset of recIDs.

    The word index (and its stemming language) is the one of field 'f',
    or the one of "anyfield" when 'f' is empty.  An empty hitset is
    returned when 'f' has no word index.

    'word' may contain '*' as truncation character and may be a span
    query of the form 'word1->word2'.

    The 'decompress' argument is not used by this function.
    """
    # deduce into which bibwordsX table we will search:
    if f:
        index_id = get_index_id_from_field(f)
        if not index_id:
            return HitSet() # word index f does not exist
    else:
        index_id = get_index_id_from_field("anyfield")
    bibwordsX = "idxWORD%02dF" % index_id
    stemming_language = get_index_stemming_language(index_id)

    # wash 'word' argument and run query:
    word = word.replace('*', '%') # we now use '*' as the truncation character
    words = word.split("->", 1) # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = stem(word0, stemming_language)
            word1 = stem(word1, stemming_language)
        res = run_sql("SELECT term,hitlist FROM %s WHERE term BETWEEN %%s AND %%s" % bibwordsX,
                      (wash_index_term(word0), wash_index_term(word1)))
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = stem(word, stemming_language)
        if '%' in word: # do we have wildcard in the word?
            res = run_sql("SELECT term,hitlist FROM %s WHERE term LIKE %%s" % bibwordsX,
                          (wash_index_term(word),))
        else:
            res = run_sql("SELECT term,hitlist FROM %s WHERE term=%%s" % bibwordsX,
                          (wash_index_term(word),))

    # fill the result set:
    hitset = HitSet() # will hold output result set
    hitset_used = False # not-yet-used flag, to be able to circumvent set operations
    for dummy_term, hitlist in res:
        hitset_bibwrd = HitSet(hitlist)
        if hitset_used:
            hitset.union_update(hitset_bibwrd)
        else:
            # first row: take its hitset as is instead of uniting with
            # the empty one
            hitset = hitset_bibwrd
            hitset_used = True

    # okay, return result set:
    return hitset
def search_unit_in_bibxxx(p, f, type):
    """Searches for pattern 'p' inside bibxxx tables for field 'f' and returns hitset of recIDs found.
    The search type is defined by 'type' (e.g. equals to 'r' for a regexp search).

    For any other type, '*' in 'p' is the truncation character and
    'p1->p2' is a span query.  'f' is either a MARC tag (it then starts
    with two digits and may contain '*') or a field name that is
    converted into the list of MARC tags to search in.
    """
    # wash arguments:
    f = f.replace('*', '%') # replace truncation char '*' in field definition
    if type == 'r':
        query_addons = "REGEXP %s"
        query_params = (p,)
    else:
        p = p.replace('*', '%') # we now use '*' as the truncation character
        ps = p.split("->", 1) # check for span query:
        if len(ps) == 2:
            query_addons = "BETWEEN %s AND %s"
            query_params = (ps[0], ps[1])
        elif '%' in p:
            query_addons = "LIKE %s"
            query_params = (ps[0],)
        else:
            query_addons = "= %s"
            query_params = (ps[0],)

    # construct 'tl' which defines the tag list (MARC tags) to search in:
    if len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl = [f] # 'f' seems to be okay as it starts by two digits
    else:
        # 'f' is a field name (this also covers names shorter than two
        # characters, which must not be indexed as MARC tags).
        # convert old ALEPH tag names, if appropriate: (TODO: get rid of this before entering this function)
        f_lower = f.lower()
        if f_lower in CFG_WEBSEARCH_FIELDS_CONVERT:
            f = CFG_WEBSEARCH_FIELDS_CONVERT[f_lower]
        # deduce desired MARC tags on the basis of chosen 'f'
        # (if f index does not exist, the tag list is simply empty)
        tl = get_field_tags(f)

    # okay, start search:
    recids = [] # will hold list of recID that matched
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
        # construct and run query:
        if t == "001":
            res = run_sql("SELECT id FROM bibrec WHERE id %s" % query_addons,
                          query_params)
        else:
            query = "SELECT bibx.id_bibrec FROM %s AS bx LEFT JOIN %s AS bibx ON bx.id=bibx.id_bibxxx WHERE bx.value %s" % \
                    (bx, bibx, query_addons)
            if len(t) != 6 or t[-1:]=='%':
                # wildcard query, or only the beginning of field 't'
                # is defined, so add wildcard character:
                query += " AND bx.tag LIKE %s"
                res = run_sql(query, query_params + (t + '%',))
            else:
                # exact query for 't':
                query += " AND bx.tag=%s"
                res = run_sql(query, query_params + (t,))
        # fill the result set (the LEFT JOIN may yield empty ids):
        for id_bibrec in res:
            if id_bibrec[0]:
                recids.append(id_bibrec[0])

    # okay, return result set:
    return HitSet(recids)
def search_unit_in_bibrec(datetext1, datetext2, type='c'):
    """
    Return hitset of recIDs found that were either created or modified
    from datetext1 until datetext2, inclusive.

    Modification dates are searched when 'type' starts with 'm';
    creation dates are searched for any other value.

    Does not pay attention to pattern, collection, anything.  Useful
    to intersect later on with the 'real' query.
    """
    if type.startswith("m"):
        date_column = "modification_date"
    else:
        date_column = "creation_date" # by default we are searching for creation dates

    query = "SELECT id FROM bibrec WHERE %s>=%%s AND %s<=%%s" % (date_column, date_column)

    hitset = HitSet()
    for row in run_sql(query, (datetext1, datetext2)):
        hitset += row[0]
    return hitset
def intersect_results_with_collrecs(req, hitset_in_any_collection, colls, ap=0, of="hb", verbose=0, ln=cdslang):
"""Return dict of hitsets given by intersection of hitset with the collection universes.
The returned dict is keyed by collection name. It is empty when no
collection of 'colls' contains any of the hits; in that case, and if
'of' starts with "h", a warning is printed to 'req', proposing the
hits found in the 'cdsname' collection if there are some.
The 'ap' argument is not used by this function.
"""
_ = gettext_set_language(ln)
# search stage 4: intersect with the collection universe:
if verbose and of.startswith("h"):
t1 = os.times()[4]
results = {}
results_nbhits = 0
for coll in colls:
results[coll] = hitset_in_any_collection & get_collection_reclist(coll)
results_nbhits += len(results[coll])
if results_nbhits == 0:
# no hits found, try to search in Home:
results_in_Home = hitset_in_any_collection & get_collection_reclist(cdsname)
if len(results_in_Home) > 0:
# some hits found in Home, so propose this search:
if of.startswith("h"):
url = websearch_templates.build_search_url(req.argd, cc=cdsname, c=[])
# NOTE(review): the string literals in the dictionary below look truncated
# in this copy of the source (their markup seems to have been stripped); as
# written, '' % (url) has no conversion specifier for 'url'.  Restore them
# from the original file before changing this function.
print_warning(req, _("No match found in collection %(x_collection)s. Other public collections gave %(x_url_open)s%(x_nb_hits)d hits%(x_url_close)s.") %\
{'x_collection': '' + string.join([get_coll_i18nname(coll, ln) for coll in colls], ', ') + '',
'x_url_open': '' % (url),
'x_nb_hits': len(results_in_Home),
'x_url_close': ''})
results = {}
else:
# no hits found in Home, recommend different search terms:
if of.startswith("h"):
print_warning(req, _("No public collection matched your query. "
"If you were looking for a non-public document, please choose "
"the desired restricted collection first."))
results = {}
if verbose and of.startswith("h"):
t2 = os.times()[4]
print_warning(req, "Search stage 4: intersecting with collection universe gave %d hits." % results_nbhits)
print_warning(req, "Search stage 4: execution took %.2f seconds." % (t2 - t1))
return results
def intersect_results_with_hitset(req, results, hitset, ap=0, aptext="", of="hb"):
    """Intersect every per-collection hitset of 'results' with 'hitset'.

    'results' is a dict of hitsets with the collection as key; each of
    its hitsets is narrowed down in place to the records that are also
    present in 'hitset'.

    If the intersection leaves no record in any collection, 'aptext'
    is printed as a warning (only when the output format 'of' starts
    with "h") and:
      - when 'ap' (approximate pattern) is true, a deep copy of
        'results' taken before the intersection is returned;
      - otherwise an empty dict is returned.
    """
    # keep a pristine copy only when we may have to fall back on it:
    if ap:
        fallback = copy.deepcopy(results)
    else:
        fallback = {}
    nb_total = 0
    for hits_in_coll in results.values():
        hits_in_coll.intersection_update(hitset)
        nb_total += len(hits_in_coll)
    if nb_total != 0:
        return results
    # nothing survived the intersection:
    if of.startswith("h"):
        print_warning(req, aptext)
    return fallback
def create_similarly_named_authors_link_box(author_name, ln=cdslang):
    """Return a box similar to ``Not satisfied...'' one by proposing
       author searches for similar names.  Namely, take AUTHOR_NAME
       and the first initial of the first name (after comma) and look
       into author index whether authors with e.g. middle names exist.
       Useful mainly for CERN Library that sometimes contains name
       forms like Ellis-N, Ellis-Nick, Ellis-Nicolas all denoting the
       same person.

       An empty string is returned when the feature is switched off,
       when AUTHOR_NAME has no "surname, initial" shape, or when no
       similarly named authors are found to exist.
    """
    # feature switched off in the configuration:
    if CFG_WEBSEARCH_CREATE_SIMILARLY_NAMED_AUTHORS_LINK_BOX == 0:
        return ""
    # nothing to propose when there is no initial after the comma:
    if re.match(r'[^ ,]+, [^ ]', author_name) is None:
        return ""
    # reduce the name to "surname, initial":
    author_name_to_search = re.sub(r'^([^ ,]+, +[^ ,]).*$', '\\1', author_name)
    # collect the name forms starting like that, with and without accents:
    candidates = {}
    for name in (author_name_to_search, strip_accents(author_name_to_search)):
        for tag in get_field_tags("author"):
            # the first two digits of the tag select the bibxxx table:
            bx = "bib%d%dx" % (int(tag[0]), int(tag[1]))
            if len(tag) != 6 or tag[-1:] == '%':
                # only the beginning of the tag is defined, so match it as a prefix:
                res = run_sql("SELECT bx.value FROM %s AS bx\n"
                              "WHERE bx.value LIKE %%s AND bx.tag LIKE %%s" % bx,
                              (name + "%", tag + "%"))
            else:
                res = run_sql("SELECT bx.value FROM %s AS bx\n"
                              "WHERE bx.value LIKE %%s AND bx.tag=%%s" % bx,
                              (name + "%", tag))
            for row in res:
                candidates[row[0]] = 1
    # the name we started from is not a suggestion:
    candidates.pop(author_name, None)
    # finally build the box out of the names that do have hits:
    out = ""
    if candidates:
        names = list(candidates.keys())
        names.sort()
        authors_with_hits = []
        for candidate in names:
            nbhits = get_nbhits_in_bibxxx(candidate, "author")
            if nbhits:
                authors_with_hits.append((candidate, nbhits))
        out += websearch_templates.tmpl_similar_author_names(
                 authors=authors_with_hits, ln=ln)
    return out
def create_nearest_terms_box(urlargd, p, f, t='w', n=5, ln=cdslang, intro_text_p=True):
    """Return text box containing list of 'n' nearest terms above/below 'p'
       for the field 'f' for matching type 't' (words/phrases) in
       language 'ln'.

       Propose new searches according to `urlargd' with the new words:
       for every nearest term a copy of `urlargd' is made in which the
       first pattern argument (p, p1, p2, p3) containing 'p' gets 'p'
       replaced by that term.

       If `intro_text_p' is true, then display the introductory message,
       otherwise print only the nearest terms in the box content.

       When no nearest terms can be found, a short "No ... index
       available for <field>." sentence is returned instead of the box.
    """
    # load the right message language
    _ = gettext_set_language(ln)
    if not p: # sanity check
        p = "."
    # look for nearest terms:
    if t == 'w':
        nearest_terms = get_nearest_terms_in_bibwords(p, f, n, n)
        if not nearest_terms:
            # the field name is escaped here just like in the intro text below
            return "%s %s." % (_("No words index available for"),
                               cgi.escape(get_field_i18nname(f, ln)))
    else:
        nearest_terms = get_nearest_terms_in_bibxxx(p, f, n, n)
        if not nearest_terms:
            return "%s %s." % (_("No phrase index available for"),
                               cgi.escape(get_field_i18nname(f, ln)))
    terminfo = []
    for term in nearest_terms:
        if t == 'w':
            hits = get_nbhits_in_bibwords(term, f)
        else:
            hits = get_nbhits_in_bibxxx(term, f)
        # work on a private copy of the URL arguments for this term:
        argd = {}
        argd.update(urlargd)
        # check which fields contained the requested parameter, and replace it.
        for (px, fx) in ('p', 'f'), ('p1', 'f1'), ('p2', 'f2'), ('p3', 'f3'):
            if px not in argd:
                continue
            # .get(): a pattern argument without its field argument is
            # simply treated as "field does not match"
            if f == argd.get(fx) or f == "anyfield" or f == "":
                if p in argd[px]:
                    argd[px] = argd[px].replace(p, term)
                    break
            else:
                # the field is spelled inside the pattern itself,
                # either as f:p or as f:"p"
                plain = f + ':' + p
                quoted = f + ':"' + p + '"'
                if plain in argd[px]:
                    argd[px] = argd[px].replace(plain, f + ':' + term)
                    break
                elif quoted in argd[px]:
                    argd[px] = argd[px].replace(quoted, f + ':"' + term + '"')
                    break
        terminfo.append((term, hits, argd))
    intro = ""
    if intro_text_p: # add full leading introductory text
        # show the pattern without its surrounding '%' wildcards, if any:
        shown_p = cgi.escape(p.startswith("%") and p.endswith("%") and p[1:-1] or p)
        if f:
            intro = _("Search term %(x_term)s inside index %(x_index)s did not match any record. Nearest terms in any collection are:") % \
                     {'x_term': shown_p,
                      'x_index': cgi.escape(get_field_i18nname(f, ln))}
        else:
            intro = _("Search term %s did not match any record. Nearest terms in any collection are:") % \
                     (shown_p,)
    return websearch_templates.tmpl_nearest_term_box(p=p, ln=ln, f=f, terminfo=terminfo,
                                                     intro=intro)
def get_nearest_terms_in_bibwords(p, f, n_below, n_above):
    """Return list of +n -n nearest terms to word `p' in index for field `f'.

    The list holds up to `n_above' terms preceding `p' in the words
    index (in ascending order), then `p' itself, then up to `n_below'
    terms following it.  When `f' is empty the "anyfield" index is
    used; when `f' has no words index, an empty list is returned.
    """
    # the "anyfield" index is the default; a given field overrides it:
    words_table = "idxWORD%02dF" % get_index_id_from_field("anyfield")
    if f:
        field_index_id = get_index_id_from_field(f)
        if not field_index_id:
            return []
        words_table = "idxWORD%02dF" % field_index_id
    # terms before `p' come back closest-first, so put them in index order:
    preceding = [row[0] for row in
                 run_sql("SELECT term FROM %s WHERE term<%%s ORDER BY term DESC LIMIT %%s" % words_table,
                         (p, n_above))]
    preceding.reverse()
    # terms after `p' are already in index order:
    following = [row[0] for row in
                 run_sql("SELECT term FROM %s WHERE term>%%s ORDER BY term ASC LIMIT %%s" % words_table,
                         (p, n_below))]
    return preceding + [p] + following
def get_nearest_terms_in_bibxxx(p, f, n_below, n_above):
    """Browse (-n_above, +n_below) closest bibliographic phrases
       for the given pattern p in the given field f, regardless
       of collection.

       'f' is either a MARC tag (starts with two digits) or a field
       code that is translated into MARC tags via get_field_tags().
       When 'f' is empty and 'p' looks like "field:pattern", the field
       is taken from 'p'.

       Return list of [phrase1, phrase2, ... , phrase_n], sorted
       regardless of accents and case."""
    ## determine browse field:
    if not f and p.find(":") > 0: # does 'p' contain ':'?
        f, p = p.split(":", 1)
    ## We are going to fetch 2*max(n_below, n_above) values from each
    ## bibXXx table.  This is needed to work around MySQL UTF-8
    ## sorting troubles in 4.0.x.  Proper solution is to use MySQL
    ## 4.1.x or our own idxPHRASE in the future.
    n_fetch = 2*max(n_below, n_above)
    ## construct 'tl' which defines the tag list (MARC tags) to search in:
    ## (the length is checked first so that an empty or one-character
    ## 'f' is looked up as a field code instead of raising IndexError)
    if f and len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl = [f] # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'
        tl = get_field_tags(f)
    ## start browsing to fetch list of hits:
    browsed_phrases = {} # will hold {phrase1: 1, phrase2: 1, ..., phraseN: 1} dict of browsed phrases (to make them unique)
    # always add self to the results set:
    browsed_phrases[p.startswith("%") and p.endswith("%") and p[1:-1] or p] = 1
    for t in tl:
        # deduce into which bibxxx table we will search:
        bx = "bib%d%dx" % (int(t[0]), int(t[1]))
        if len(t) != 6 or t[-1:] == '%':
            # only the beginning of field 't' is defined, so add wildcard character:
            tag_condition, tag_value = "bx.tag LIKE %s", t + "%"
        else:
            tag_condition, tag_value = "bx.tag=%s", t
        # firstly the closest phrases above `p', secondly the ones
        # equal to or below `p':
        for value_op, direction in (("<", "DESC"), (">=", "ASC")):
            query = "SELECT bx.value FROM %s AS bx " \
                    "WHERE bx.value%s%%s AND %s " \
                    "ORDER BY bx.value %s LIMIT %%s" % (bx, value_op, tag_condition, direction)
            for row in run_sql(query, (p, tag_value, n_fetch)):
                browsed_phrases[row[0]] = 1
    # select first n words only: (this is needed as we were searching
    # in many different tables and so aren't sure we have more than n
    # words right; this of course won't be needed when we shall have
    # one ACC table only for given field):
    # sort on the accent- and case-less form, computed once per phrase;
    # phrases with the same normalized form are ordered by their raw value
    decorated = [(strip_accents(phrase).lower(), phrase) for phrase in browsed_phrases.keys()]
    decorated.sort()
    phrases_out = [phrase for (dummy, phrase) in decorated]
    # find position of self:
    try:
        idx_p = phrases_out.index(p)
    except ValueError:
        # 'p' itself is not among the phrases, so centre on the middle
        idx_p = len(phrases_out) // 2
    # return n_above and n_below:
    return phrases_out[max(0, idx_p-n_above):idx_p+n_below]
def get_nbhits_in_bibwords(word, f):
    """Return number of hits for word 'word' inside words index for field 'f'.

    When 'f' is empty, the "anyfield" words index is consulted.
    Return 0 when 'f' has no words index or when 'word' is empty.
    """
    # the "anyfield" index is the default; a given field overrides it:
    words_table = "idxWORD%02dF" % get_index_id_from_field("anyfield")
    if f:
        field_index_id = get_index_id_from_field(f)
        if not field_index_id:
            return 0
        words_table = "idxWORD%02dF" % field_index_id
    if not word:
        return 0
    nbhits = 0
    for row in run_sql("SELECT hitlist FROM %s WHERE term=%%s" % words_table,
                       (word,)):
        # each row carries a serialized hit list; count its members
        nbhits += len(HitSet(row[0]))
    return nbhits
def get_nbhits_in_bibxxx(p, f):
    """Return number of records having the exact value 'p' in field 'f'.

    'f' is either a MARC tag (starts with two digits) or a field code
    that is translated into MARC tags via get_field_tags().  When 'f'
    is empty and 'p' looks like "field:pattern", the field is taken
    from 'p'.  Every record is counted once, however many of the
    searched tags match.
    """
    ## determine browse field:
    if not f and p.find(":") > 0: # does 'p' contain ':'?
        f, p = p.split(":", 1)
    ## construct 'tl' which defines the tag list (MARC tags) to search in:
    ## (the length is checked first so that an empty or one-character
    ## 'f' is looked up as a field code instead of raising IndexError)
    if f and len(f) >= 2 and str(f[0]).isdigit() and str(f[1]).isdigit():
        tl = [f] # 'f' seems to be okay as it starts by two digits
    else:
        # deduce desired MARC tags on the basis of chosen 'f'
        tl = get_field_tags(f)
    # start searching:
    recIDs = {} # will hold dict of {recID1: 1, recID2: 1, ..., } (unique recIDs, therefore)
    for t in tl:
        # deduce into which bibxxx table we will search:
        digit1, digit2 = int(t[0]), int(t[1])
        bx = "bib%d%dx" % (digit1, digit2)
        bibx = "bibrec_bib%d%dx" % (digit1, digit2)
        if len(t) != 6 or t[-1:] == '%':
            # only the beginning of field 't' is defined, so add wildcard character:
            res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx
                              WHERE bx.value=%%s AND bx.tag LIKE %%s
                                AND bibx.id_bibxxx=bx.id""" % (bibx, bx),
                          (p, t + "%"))
        else:
            res = run_sql("""SELECT bibx.id_bibrec FROM %s AS bibx, %s AS bx
                              WHERE bx.value=%%s AND bx.tag=%%s
                                AND bibx.id_bibxxx=bx.id""" % (bibx, bx),
                          (p, t))
        for row in res:
            recIDs[row[0]] = 1
    return len(recIDs)
def get_mysql_recid_from_aleph_sysno(sysno):
    """Returns DB's recID for ALEPH sysno passed in the argument (e.g. "002379334CER").
       The sysno is looked up as the value of tag 970__a; the record of
       the first matching row is returned.
       Returns None in case of failure."""
    res = run_sql("SELECT bb.id_bibrec FROM bibrec_bib97x AS bb, bib97x AS b\n"
                  "WHERE b.value=%s AND b.tag='970__a' AND bb.id_bibxxx=b.id",
                  (sysno,))
    if not res:
        return None
    return res[0][0]
def guess_primary_collection_of_a_record(recID):
    """Return primary collection name a record recid belongs to, by testing 980 identifier.
       The first 980__a value of the record is turned into the query
       "collection:<value>" and the collection defined by exactly that
       dbquery is returned.
       May lead to bad guesses when a collection is defined dynamically via dbquery.
       In that case, or when the record has no 980__a value, return 'cdsname'."""
    fallback = cdsname
    dbcollids = get_fieldvalues(recID, "980__a")
    if not dbcollids:
        return fallback
    res = run_sql("SELECT name FROM collection WHERE dbquery=%s",
                  ("collection:" + dbcollids[0],))
    if res:
        return res[0][0]
    return fallback
def get_tag_name(tag_value, prolog="", epilog=""):
    """Return tag name from the known tag value, by looking up the 'tag' table.
       The name is wrapped between 'prolog' and 'epilog'.
       Return empty string in case of failure.
       Example: input='100__%', output='first author'."""
    res = run_sql("SELECT name FROM tag WHERE value=%s", (tag_value,))
    if not res:
        return ""
    return prolog + res[0][0] + epilog
def get_fieldcodes():
    """Returns a list of field codes that may have been passed as 'search options' in URL.
       The codes are the distinct values of 'code' in the 'field' table.
       Example: output=['subject','division']."""
    return [row[0] for row in run_sql("SELECT DISTINCT(code) FROM field")]
def get_field_tags(field):
    """Returns a list of MARC tags for the field code 'field'.
       The tags are ordered by descending score of the field-tag link.
       Returns empty list when no tag is defined for 'field'.
       Example: field='author', output=['100__%','700__%']."""
    res = run_sql("SELECT t.value FROM tag AS t, field_tag AS ft, field AS f\n"
                  "WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag\n"
                  "ORDER BY ft.score DESC",
                  (field, ))
    return [row[0] for row in res]
def get_fieldvalues(recID, tag):
    """Return list of field values for field TAG inside record RECID.

    TAG is matched with SQL LIKE, so it may contain wildcards (e.g.
    "980__%").  The special tag "001___" yields the record ID itself,
    as a string, without any database access.  The values are ordered
    by field number and tag.

    An empty list is returned when TAG does not start with two decimal
    digits, since no bibXXx table can be deduced from it then.
    """
    if tag == "001___":
        # we have asked for recID that is not stored in bibXXx tables
        return [str(recID)]
    # we are going to look inside bibXXx tables; the table name is made
    # of the first two characters of the tag, so accept digits only:
    digits = tag[0:2]
    if len(digits) != 2 or digits[0] not in "0123456789" or digits[1] not in "0123456789":
        # invalid tag value asked for
        return []
    bx = "bib%sx" % digits
    bibx = "bibrec_bib%sx" % digits
    # record ID and tag are passed as query parameters, never pasted
    # into the SQL text:
    query = "SELECT bx.value FROM %s AS bx, %s AS bibx " \
            " WHERE bibx.id_bibrec=%%s AND bx.id=bibx.id_bibxxx AND bx.tag LIKE %%s " \
            " ORDER BY bibx.field_number, bx.tag ASC" % (bx, bibx)
    return [row[0] for row in run_sql(query, (recID, tag))]
def get_fieldvalues_alephseq_like(recID, tags_in):
    """Return buffer of ALEPH sequential-like textual format with fields found in the list TAGS_IN for record RECID.

    TAGS_IN is a list of tags or tag prefixes; a single non-list value
    is treated as a one-element list.

    Case A: exactly one full six-character tag is asked for.  Its
    values are returned one per line, without any line prefix.

    Case B: anything else.  Every field is printed on its own line as
    "<recID padded to 9 digits> <tag and indicators> " followed by its
    subfields as "$$<code><value>"; fields whose tag starts with "00"
    and whose subfield code is "_" print the bare value instead.  An
    empty TAGS_IN (or an empty tag) selects all tags 00-99.  Tags that
    do not start with two decimal digits are silently skipped.
    """
    out = ""
    if not isinstance(tags_in, list):
        tags_in = [tags_in,]
    if len(tags_in) == 1 and len(tags_in[0]) == 6:
        ## case A: one concrete subfield asked, so print its value if found
        ## (use with care: can mislead you if field has multiple occurrences)
        return "\n".join(get_fieldvalues(recID, tags_in[0]))
    ## case B: print our "text MARC" format; works safely all the time
    # find out which tag patterns to output:
    every_tag_prefix = ["%d%d%%" % (i, j) for i in range(0, 10) for j in range(0, 10)]
    dict_of_tags_out = {}
    if not tags_in:
        for prefix in every_tag_prefix:
            dict_of_tags_out[prefix] = 1
    else:
        for tag in tags_in:
            if len(tag) == 0:
                for prefix in every_tag_prefix:
                    dict_of_tags_out[prefix] = 1
            elif len(tag) == 1:
                for j in range(0, 10):
                    dict_of_tags_out["%s%d%%" % (tag, j)] = 1
            elif len(tag) < 5:
                dict_of_tags_out["%s%%" % tag] = 1
            else:
                # five characters or more: keep tag and indicators only
                dict_of_tags_out[tag[0:5]] = 1
    tags_out = list(dict_of_tags_out.keys())
    tags_out.sort()
    # search all bibXXx tables as needed:
    for tag in tags_out:
        # the table name is made of the first two characters of the
        # tag, so accept digits only:
        digits = tag[0:2]
        if len(digits) != 2 or digits[0] not in "0123456789" or digits[1] not in "0123456789":
            # invalid tag value asked for
            continue
        if tag.startswith("001") or tag.startswith("00%"):
            # the record ID is not stored in bibXXx tables, so print it here
            if out:
                out += "\n"
            out += "%09d %s %d" % (recID, "001__", recID)
        bx = "bib%sx" % digits
        bibx = "bibrec_bib%sx" % digits
        # record ID and tag pattern are passed as query parameters,
        # never pasted into the SQL text:
        query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
                "WHERE bb.id_bibrec=%%s AND b.id=bb.id_bibxxx AND b.tag LIKE %%s "\
                "ORDER BY bb.field_number, b.tag ASC" % (bx, bibx)
        res = run_sql(query, (recID, tag + "%"))
        # go through fields:
        field_number_old = -999
        field_old = ""
        for row in res:
            field, value, field_number = row[0], row[1], row[2]
            # start a new output line whenever the field instance or
            # its tag and indicators change:
            if field_number != field_number_old or field[:-1] != field_old[:-1]:
                if out:
                    out += "\n"
                out += "%09d %s " % (recID, field[:5])
                field_number_old = field_number
                field_old = field
            # print subfield value
            if field[0:2] == "00" and field[-1:] == "_":
                out += value
            else:
                out += "$$%s%s" % (field[-1:], value)
    return out
def record_exists(recID):
    """Return 1 if record RECID exists.
       Return 0 if it doesn't exist.
       Return -1 if it exists but is marked as deleted.

       A record counts as deleted when one of its 980 values is
       "DELETED" or, on CERN sites (CFG_CERN_SITE), "DUMMY"."""
    # the record ID is passed as a query parameter, never pasted into
    # the SQL text:
    res = run_sql("SELECT id FROM bibrec WHERE id=%s", (recID,), 1)
    if not res:
        return 0
    # record exists; now check whether it isn't marked as deleted:
    dbcollids = get_fieldvalues(recID, "980__%")
    if ("DELETED" in dbcollids) or (CFG_CERN_SITE and "DUMMY" in dbcollids):
        return -1 # exists, but marked as deleted
    return 1 # exists fine
def record_public_p(recID):
    """Tell whether the record is public, i.e. whether it can be found
       in the Home collection ('cdsname').
       Return a true value if so, a false value otherwise.
    """
    public_recids = get_collection_reclist(cdsname)
    return recID in public_recids
def get_creation_date(recID, fmt="%Y-%m-%d"):
    """Returns the creation date of the record 'recID', formatted by
       the database according to 'fmt'.
       Returns empty string when the record is not found."""
    res = run_sql("SELECT DATE_FORMAT(creation_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
    if not res:
        return ""
    return res[0][0]
def get_modification_date(recID, fmt="%Y-%m-%d"):
    """Returns the date of last modification for the record 'recID',
       formatted by the database according to 'fmt'.
       Returns empty string when the record is not found."""
    res = run_sql("SELECT DATE_FORMAT(modification_date,%s) FROM bibrec WHERE id=%s", (fmt, recID), 1)
    if not res:
        return ""
    return res[0][0]
def print_warning(req, msg, type='', prologue=' ', epilogue=' '):
    """Write the warning message 'msg', rendered by the warning
       template, to 'req'.
       Nothing is written when either 'req' or 'msg' is empty."""
    if not (req and msg):
        return
    rendered = websearch_templates.tmpl_print_warning(
                 msg = msg,
                 type = type,
                 prologue = prologue,
                 epilogue = epilogue,
               )
    req.write(rendered)
    return
def print_search_info(p, f, sf, so, sp, rm, of, ot, collection=cdsname, nb_found=-1, jrec=1, rg=10,
as=0, ln=cdslang, p1="", p2="", p3="", f1="", f2="", f3="", m1="", m2="", m3="", op1="", op2="",
sc=1, pl_in_url="",
d1y=0, d1m=0, d1d=0, d2y=0, d2m=0, d2d=0, dt="",
cpu_time=-1, middle_only=0):
"""Prints stripe with the information on 'collection' and 'nb_found' results and CPU time.
Also, prints navigation links (beg/next/prev/end) inside the results set.
If middle_only is set to 1, it will only print the middle box information (beg/netx/prev/end/etc) links.
This is suitable for displaying navigation links at the bottom of the search results page."""
out = ""
# sanity check:
if jrec < 1:
jrec = 1
if jrec > nb_found:
jrec = max(nb_found-rg+1, 1)
return websearch_templates.tmpl_print_search_info(
ln = ln,
weburl = weburl,
collection = collection,
as = as,
collection_name = get_coll_i18nname(collection, ln),
collection_id = get_colID(collection),
middle_only = middle_only,
rg = rg,
nb_found = nb_found,
sf = sf,
so = so,
rm = rm,
of = of,
ot = ot,
p = p,
f = f,
p1 = p1,
p2 = p2,
p3 = p3,
f1 = f1,
f2 = f2,
f3 = f3,
m1 = m1,
m2 = m2,
m3 = m3,
op1 = op1,
op2 = op2,
pl_in_url = pl_in_url,
d1y = d1y,
d1m = d1m,
d1d = d1d,
d2y = d2y,
d2m = d2m,
d2d = d2d,
dt = dt,
jrec = jrec,
sc = sc,
sp = sp,
all_fieldcodes = get_fieldcodes(),
cpu_time = cpu_time,
)
def print_results_overview(req, colls, results_final_nb_total, results_final_nb, cpu_time, ln=cdslang, ec=[]):
    """Return the results overview box with links to particular
       collections below, as rendered by the results overview template.

       Every collection name of 'colls' is handed to the template as a
       dict with its 'id', its 'code' (the name itself) and its 'name'
       in language 'ln'.  'req' is not used here."""
    colls_info = [{'id': get_colID(coll),
                   'code': coll,
                   'name': get_coll_i18nname(coll, ln)}
                  for coll in colls]
    return websearch_templates.tmpl_print_results_overview(
             ln = ln,
             weburl = weburl,
             results_final_nb_total = results_final_nb_total,
             results_final_nb = results_final_nb,
             cpu_time = cpu_time,
             colls = colls_info,
             ec = ec,
           )
def sort_records(req, recIDs, sort_field='', sort_order='d', sort_pattern='', verbose=0, of='hb', ln=cdslang):
    """Sort records in 'recIDs' list according sort field 'sort_field' in order 'sort_order'.

       'sort_field' may hold several comma-separated fields; each is
       either a field code like 'author' or a MARC tag like '100__a'
       directly.  A field code that is not known falls back to title
       sort (245__a), with a warning for HTML output formats.

       If more than one instance of 'sort_field' is found for a given
       record, try to choose that that is given by 'sort_pattern', for
       example "sort by report number that starts by CERN-PS".

       Values are compared regardless of accents and case.  The list
       is built in increasing order of value and reversed when
       'sort_order' is 'a'.

       'recIDs' is returned unchanged when no sort field is given or
       when it holds more than CFG_WEBSEARCH_NB_RECORDS_TO_SORT
       records."""
    _ = gettext_set_language(ln)
    ## check arguments:
    if not sort_field:
        return recIDs
    if len(recIDs) > CFG_WEBSEARCH_NB_RECORDS_TO_SORT:
        if of.startswith('h'):
            print_warning(req, _("Sorry, sorting is allowed on sets of up to %d records only. Using default sort order.") % CFG_WEBSEARCH_NB_RECORDS_TO_SORT, "Warning")
        return recIDs
    ## first deduce sorting MARC tag out of the 'sort_field' argument:
    tags = []
    for one_sort_field in sort_field.split(","):
        if one_sort_field and str(one_sort_field[0:2]).isdigit():
            # sort_field starts by two digits, so this is probably a MARC tag already
            tags.append(one_sort_field)
        else:
            # let us check the 'field' table; the field code comes from
            # the user, so it is passed as a query parameter:
            res = run_sql("""SELECT DISTINCT(t.value) FROM tag AS t, field_tag AS ft, field AS f
                              WHERE f.code=%s AND ft.id_field=f.id AND t.id=ft.id_tag
                              ORDER BY ft.score DESC""", (one_sort_field,))
            if res:
                for row in res:
                    tags.append(row[0])
            else:
                if of.startswith('h'):
                    # NOTE(review): escaped on the assumption that the
                    # warning text is emitted as HTML -- confirm
                    # against tmpl_print_warning
                    print_warning(req, _("Sorry, %s does not seem to be a valid sort option. Choosing title sort instead.") % cgi.escape(one_sort_field), "Error")
                tags.append("245__a")
    if verbose >= 3:
        print_warning(req, "Sorting by tags %s." % tags)
        if sort_pattern:
            print_warning(req, "Sorting preferentially by %s." % cgi.escape(sort_pattern))
    ## check if we have sorting tag defined:
    if not tags:
        # good, no sort needed
        return recIDs
    # fetch the necessary field values and group the records by them:
    sort_pattern_lower = sort_pattern.lower()
    recIDs_by_value = {}
    for recID in recIDs:
        vals = [] # will hold all values found in sorting tag for recID
        for tag in tags:
            vals.extend(get_fieldvalues(recID, tag))
        if sort_pattern:
            # try to pick that tag value that corresponds to sort pattern
            for v in vals:
                if v.lower().startswith(sort_pattern_lower): # bingo!
                    val = v
                    break
            else:
                # sort_pattern not present, so add other vals after spaces
                val = sort_pattern + " " + " ".join(vals)
        else:
            # no sort pattern defined, so join them all together
            val = " ".join(vals)
        val = strip_accents(val.lower()) # sort values regardless of accents and case
        recIDs_by_value.setdefault(val, []).append(recID)
    # sort them:
    sorted_values = list(recIDs_by_value.keys())
    sorted_values.sort()
    # now that keys are sorted, create output array:
    recIDs_out = []
    for value in sorted_values:
        recIDs_out.extend(recIDs_by_value[value])
    # ascending or descending?
    if sort_order == 'a':
        recIDs_out.reverse()
    # okay, we are done
    return recIDs_out
def print_records(req, recIDs, jrec=1, rg=10, format='hb', ot='', ln=cdslang, relevances=[], relevances_prologue="(", relevances_epilogue="%%)", decompress=zlib.decompress, search_pattern='', print_records_prologue_p=True, print_records_epilogue_p=True, verbose=0, tab=''):
"""
- Prints list of records 'recIDs' formatted accoding to 'format' in
+ Prints list of records 'recIDs' formatted according to 'format' in
groups of 'rg' starting from 'jrec'.
Assumes that the input list 'recIDs' is sorted in reverse order,
so it counts records from tail to head.
A value of 'rg=-9999' means to print all records: to be used with care.
Print also list of RELEVANCES for each record (if defined), in
between RELEVANCE_PROLOGUE and RELEVANCE_EPILOGUE.
Print prologue and/or epilogue specific to 'format' if
'print_records_prologue_p' and/or print_records_epilogue_p' are
True.
"""
# load the right message language
_ = gettext_set_language(ln)
# sanity checking:
if req is None:
return
- # get user id (for formatting based on priviledge)
- uid = getUid(req)
+ # get user_info (for formatting based on user)
+ user_info = collect_user_info(req)
if len(recIDs):
nb_found = len(recIDs)
if rg == -9999: # print all records
rg = nb_found
else:
rg = abs(rg)
if jrec < 1: # sanity checks
jrec = 1
if jrec > nb_found:
jrec = max(nb_found-rg+1, 1)
# will print records from irec_max to irec_min excluded:
irec_max = nb_found - jrec
irec_min = nb_found - jrec - rg
if irec_min < 0:
irec_min = -1
if irec_max >= nb_found:
irec_max = nb_found - 1
#req.write("%s:%d-%d" % (recIDs, irec_min, irec_max))
# the output format decides how the selected records are printed:
if format.startswith('x'):
# print header if needed
if print_records_prologue_p:
print_records_prologue(req, format)
# print records
# (selected from the tail of recIDs towards its head and handed
# over to format_records() in one call)
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
format_records(recIDs_to_print,
format,
ln=ln,
search_pattern=search_pattern,
record_separator="\n",
- uid=uid,
+ user_info=user_info,
req=req)
# print footer if needed
if print_records_epilogue_p:
print_records_epilogue(req, format)
elif format.startswith('t') or str(format[0:3]).isdigit():
# we are doing plain text output:
for irec in range(irec_max, irec_min, -1):
x = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
req.write(x)
if x:
req.write('\n')
elif format == 'excel':
recIDs_to_print = [recIDs[x] for x in range(irec_max, irec_min, -1)]
create_excel(recIDs=recIDs_to_print, req=req, ln=ln)
else:
# we are doing HTML output:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
# portfolio and on-the-fly formats:
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose))
+ user_info=user_info, verbose=verbose))
elif format.startswith("hb"):
# HTML brief format:
req.write(websearch_templates.tmpl_record_format_htmlbrief_header(
ln = ln))
for irec in range(irec_max, irec_min, -1):
# row number: 'jrec' for the first printed record, then increasing by one
row_number = jrec+irec_max-irec
recid = recIDs[irec]
if relevances and relevances[irec]:
relevance = relevances[irec]
else:
relevance = ''
record = print_record(recIDs[irec], format, ot, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
req.write(websearch_templates.tmpl_record_format_htmlbrief_body(
ln = ln,
recid = recid,
row_number = row_number,
relevance = relevance,
record = record,
relevances_prologue = relevances_prologue,
relevances_epilogue = relevances_epilogue,
))
req.write(websearch_templates.tmpl_record_format_htmlbrief_footer(
ln = ln))
elif format.startswith("hd"):
# HTML detailed format:
for irec in range(irec_max, irec_min, -1):
unordered_tabs = get_detailed_page_tabs(get_colID(guess_primary_collection_of_a_record(recIDs[irec])),
recIDs[irec], ln=ln)
# order the tab ids by the 'order' value of each tab
ordered_tabs_id = [(tab_id, values['order']) for (tab_id, values) in unordered_tabs.iteritems()]
ordered_tabs_id.sort(lambda x,y: cmp(x[1],y[1]))
# the language is added to the tab links only when it differs from cdslang
link_ln = ''
if ln != cdslang:
link_ln = '?ln=%s' % ln
# each entry is (label, URL, whether it is the requested 'tab', enabled flag);
# only tabs flagged as visible are kept
tabs = [(unordered_tabs[tab_id]['label'], \
'%s/record/%s/%s%s' % (weburl, recIDs[irec], tab_id, link_ln), \
tab_id == tab,
unordered_tabs[tab_id]['enabled']) \
for (tab_id, order) in ordered_tabs_id
if unordered_tabs[tab_id]['visible'] == True]
content = ''
# load content
if tab == 'usage':
r = calculate_reading_similarity_list(recIDs[irec], "downloads")
downloadsimilarity = None
downloadhistory = None
#if r:
# downloadsimilarity = r
if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS:
downloadhistory = create_download_history_graph_and_box(recIDs[irec], ln)
r = calculate_reading_similarity_list(recIDs[irec], "pageviews")
viewsimilarity = None
if r: viewsimilarity = r
content = websearch_templates.tmpl_detailed_record_statistics(recIDs[irec],
ln,
downloadsimilarity=downloadsimilarity,
downloadhistory=downloadhistory,
viewsimilarity=viewsimilarity)
req.write(webstyle_templates.detailed_record_container(content,
recIDs[irec],
tabs,
ln))
elif tab == 'citations':
citinglist = []
citationhistory = None
recid = recIDs[irec]
selfcited = get_self_cited_by(recid)
r = calculate_cited_by_list(recid)
if r:
citinglist = r
citationhistory = create_citation_history_graph_and_box(recid, ln)
r = calculate_co_cited_with_list(recid)
cociting = None
if r:
cociting = r
content = websearch_templates.tmpl_detailed_record_citations(recid,
ln,
citinglist=citinglist,
citationhistory=citationhistory,
cociting=cociting,
selfcited=selfcited)
req.write(webstyle_templates.detailed_record_container(content,
recid,
tabs,
ln))
elif tab == 'references':
- content = format_record(recIDs[irec], 'HDREF', ln=ln, uid=uid, verbose=verbose)
+ content = format_record(recIDs[irec], 'HDREF', ln=ln, user_info=user_info, verbose=verbose)
req.write(webstyle_templates.detailed_record_container(content,
recIDs[irec],
tabs,
ln))
else:
# Metadata tab
content = print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
# the dates are looked up only for records that exist and are not deleted
creationdate = None
modifydate = None
if record_exists(recIDs[irec]) == 1:
creationdate = get_creation_date(recIDs[irec])
modifydate = get_modification_date(recIDs[irec])
content = websearch_templates.tmpl_detailed_record_metadata(
recID = recIDs[irec],
ln = ln,
format = format,
creationdate = creationdate,
modifydate = modifydate,
content = content)
req.write(webstyle_templates.detailed_record_container(content,
recIDs[irec],
tabs,
ln=ln,
creationdate=creationdate,
modifydate=modifydate,
show_short_rec_p=False))
if len(tabs) > 0:
# Add the mini box at bottom of the page
if CFG_WEBCOMMENT_ALLOW_REVIEWS:
from invenio.webcomment import get_mini_reviews
reviews = get_mini_reviews(recid = recIDs[irec], ln=ln)
else:
reviews = ''
- actions = format_record(recIDs[irec], 'HDACT', ln=ln, uid=uid, verbose=verbose)
- files = format_record(recIDs[irec], 'HDFILE', ln=ln, uid=uid, verbose=verbose)
+ actions = format_record(recIDs[irec], 'HDACT', ln=ln, user_info=user_info, verbose=verbose)
+ files = format_record(recIDs[irec], 'HDFILE', ln=ln, user_info=user_info, verbose=verbose)
req.write(webstyle_templates.detailed_record_mini_panel(recIDs[irec],
ln,
format,
files=files,
reviews=reviews,
actions=actions))
else:
# Other formats
for irec in range(irec_max, irec_min, -1):
req.write(print_record(recIDs[irec], format, ot, ln,
search_pattern=search_pattern,
- uid=uid, verbose=verbose))
+ user_info=user_info, verbose=verbose))
else:
# no record to print at all
print_warning(req, _("Use different search terms."))
def print_records_prologue(req, format):
    """
    Write to 'req' the prologue that opens a list of records in the
    given format.  XML formats ('xm', 'xn', 'xr', any other 'x...')
    get the prologue of the matching template; for all other formats
    an empty string is written.
    """
    # more specific prefixes first, the generic 'x' last:
    prologue_makers = (('xm', 'tmpl_xml_marc_prologue'),
                       ('xn', 'tmpl_xml_nlm_prologue'),
                       ('xr', 'tmpl_xml_rss_prologue'),
                       ('x', 'tmpl_xml_default_prologue'))
    prologue = "" # no prologue needed for HTML or Text formats
    for prefix, maker_name in prologue_makers:
        if format.startswith(prefix):
            prologue = getattr(websearch_templates, maker_name)()
            break
    req.write(prologue)
def print_records_epilogue(req, format):
    """
    Write to 'req' the epilogue that closes a list of records in the
    given format.  XML formats ('xm', 'xn', 'xr', any other 'x...')
    get the epilogue of the matching template; for all other formats
    an empty string is written.
    """
    # more specific prefixes first, the generic 'x' last:
    epilogue_makers = (('xm', 'tmpl_xml_marc_epilogue'),
                       ('xn', 'tmpl_xml_nlm_epilogue'),
                       ('xr', 'tmpl_xml_rss_epilogue'),
                       ('x', 'tmpl_xml_default_epilogue'))
    epilogue = "" # no epilogue needed for HTML or Text formats
    for prefix, maker_name in epilogue_makers:
        if format.startswith(prefix):
            epilogue = getattr(websearch_templates, maker_name)()
            break
    req.write(epilogue)
def print_record(recID, format='hb', ot='', ln=cdslang, decompress=zlib.decompress,
- search_pattern=None, uid=None, verbose=0):
+ search_pattern=None, user_info=None, verbose=0):
"""Return record 'recID' formatted according to 'format', as a string.

Despite its name this function does not write anything itself; the
formatted text is accumulated in 'out' and returned to the caller.

An empty string is returned when record_exists(recID) reports 0.

When CFG_BIBFORMAT_USE_OLD_BIBFORMAT is false, 'ot' is empty and
'format' neither starts with 't' or 'hm' nor begins with three digits,
the record is formatted through call_bibformat() (a deleted record in
an output format whose content type is 'text/html' gets a "deleted"
notice instead), brief links are appended for 'hb*' formats other than
'hb_p', and the function returns.  In every other case the legacy code
further below assembles the output itself, branching on 'format'.

Parameters:
recID - identifier of the record to format.
format - output format code; '' is treated as 'hd' on the
BibFormat path.
ot - tags to output; used by the 't*' formats.
ln - language used for translated messages and templates.
decompress - callable applied to preformatted values read from
the bibfmt table.
search_pattern, user_info, verbose - forwarded to call_bibformat().
"""
_ = gettext_set_language(ln)
out = ""
# sanity check:
# record_exists() is compared below against 0 (no such record),
# -1 (deleted record) and 1 (existing record).
record_exist_p = record_exists(recID)
if record_exist_p == 0: # doesn't exist
return out
# New Python BibFormat procedure for formatting
# Old procedure follows further below
# We must still check some special formats, but these
# should disappear when BibFormat improves.
if not (CFG_BIBFORMAT_USE_OLD_BIBFORMAT \
or format.lower().startswith('t') \
or format.lower().startswith('hm') \
or str(format[0:3]).isdigit() \
or ot):
# Unspecified format is hd
if format == '':
format = 'hd'
if record_exist_p == -1 and get_output_format_content_type(format) == 'text/html':
# HTML output displays a default value for deleted records.
# Other format have to deal with it.
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format.lower().startswith('hb') and \
format.lower() != 'hb_p':
out += websearch_templates.tmpl_print_record_brief_links(
ln = ln,
recID = recID,
weburl = weburl
)
# the new BibFormat path ends here; nothing below runs for it
return out
# Old PHP BibFormat procedure for formatting
# NOTE(review): several string literals in this legacy section look as
# if markup was stripped from them in this view (whitespace-only
# literals, format strings with fewer placeholders than arguments) --
# confirm against the repository before relying on them.
# print record opening tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " \n"
out += " \n"
for oai_id in get_fieldvalues(recID, CFG_OAI_ID_FIELD):
out += " %s\n" % oai_id
out += " %s\n" % get_modification_date(recID)
out += " \n"
out += " \n"
if format.startswith("xm") or format == "marcxml":
# look for detailed format existence:
# NOTE(review): the SQL text is built by string interpolation here;
# run_sql is called with a parameter tuple elsewhere in this file
# (see log_query) -- consider doing the same.
query = "SELECT value FROM bibfmt WHERE id_bibrec='%s' AND format='%s'" % (recID, format)
res = run_sql(query, None, 1)
if res and record_exist_p == 1:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format' -- they are not in "bibfmt" table; so fetch all the data from "bibXXx" tables:
if format == "marcxml":
out += """ \n"""
out += " %d\n" % int(recID)
elif format.startswith("xm"):
out += """ \n"""
out += " %d\n" % int(recID)
if record_exist_p == -1:
# deleted record, so display only OAI ID and 980:
oai_ids = get_fieldvalues(recID, CFG_OAI_ID_FIELD)
if oai_ids:
# CFG_OAI_ID_FIELD is sliced into tag (chars 0-2), the two
# indicators (chars 3 and 4) and the subfield code (char 5)
out += "%s\n" % \
(CFG_OAI_ID_FIELD[0:3], CFG_OAI_ID_FIELD[3:4], CFG_OAI_ID_FIELD[4:5], CFG_OAI_ID_FIELD[5:6], oai_ids[0])
out += "DELETED\n"
else:
# controlfields
query = "SELECT b.tag,b.value,bb.field_number FROM bib00x AS b, bibrec_bib00x AS bb "\
"WHERE bb.id_bibrec='%s' AND b.id=bb.id_bibxxx AND b.tag LIKE '00%%' "\
"ORDER BY bb.field_number, b.tag ASC" % recID
res = run_sql(query)
for row in res:
field, value = row[0], row[1]
value = encode_for_xml(value)
out += """ %s\n""" % \
(encode_for_xml(field[0:3]), value)
# datafields
i = 1 # Do not process bib00x and bibrec_bib00x, as
# they are controlfields. So start at bib01x and
# bibrec_bib00x (and set i = 0 at the end of
# first loop)
for digit1 in range(0, 10):
for digit2 in range(i, 10):
# one table pair per two-digit tag prefix: bibNNx / bibrec_bibNNx
bx = "bib%d%dx" % (digit1, digit2)
bibx = "bibrec_bib%d%dx" % (digit1, digit2)
query = "SELECT b.tag,b.value,bb.field_number FROM %s AS b, %s AS bb "\
"WHERE bb.id_bibrec='%s' AND b.id=bb.id_bibxxx AND b.tag LIKE '%s%%' "\
"ORDER BY bb.field_number, b.tag ASC" % (bx, bibx, recID, str(digit1)+str(digit2))
res = run_sql(query)
# -999 means that no datafield has been opened yet in this table
field_number_old = -999
field_old = ""
for row in res:
field, value, field_number = row[0], row[1], row[2]
ind1, ind2 = field[3], field[4]
# "_" and "" both stand for a blank indicator
if ind1 == "_" or ind1 == "":
ind1 = " "
if ind2 == "_" or ind2 == "":
ind2 = " "
# print field tag
# a new datafield starts when the field number changes or when
# the tag plus indicators (all but the last character) differ
if field_number != field_number_old or field[:-1] != field_old[:-1]:
if field_number_old != -999:
out += """ \n"""
out += """ \n""" % \
(encode_for_xml(field[0:3]), encode_for_xml(ind1), encode_for_xml(ind2))
field_number_old = field_number
field_old = field
# print subfield value
# the last character of the stored tag is used as the subfield code
value = encode_for_xml(value)
out += """ %s\n""" % \
(encode_for_xml(field[-1:]), value)
# all fields/subfields printed in this run, so close the tag:
if field_number_old != -999:
out += """ \n"""
i = 0 # Next loop should start looking at bib%0 and bibrec_bib00x
# we are at the end of printing the record:
out += " \n"
elif format == "xd" or format == "oai_dc":
# XML Dublin Core format, possibly OAI -- select only some bibXXx fields:
out += """ \n"""
if record_exist_p == -1:
out += ""
else:
# NOTE(review): the 041__a values are the only ones in this branch
# that are not passed through encode_for_xml -- confirm this is intended
for f in get_fieldvalues(recID, "041__a"):
out += " %s\n" % f
for f in get_fieldvalues(recID, "100__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "700__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "245__a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "65017a"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "8564_u"):
out += " %s\n" % encode_for_xml(f)
for f in get_fieldvalues(recID, "520__a"):
out += " %s\n" % encode_for_xml(f)
out += " %s\n" % get_creation_date(recID)
out += " \n"
elif str(format[0:3]).isdigit():
# user has asked to print some fields only
if format == "001":
out += "%s\n" % (format, recID, format)
else:
vals = get_fieldvalues(recID, format)
for val in vals:
out += "%s\n" % (format, val, format)
elif format.startswith('t'):
## user directly asked for some tags to be displayed only
if record_exist_p == -1:
out += get_fieldvalues_alephseq_like(recID, ["001", CFG_OAI_ID_FIELD, "980"])
else:
out += get_fieldvalues_alephseq_like(recID, ot)
elif format == "hm":
# only the deleted-record case is handled in this branch
# NOTE(review): the literal below spans two physical lines in this
# view; presumably its original content was lost -- confirm.
if record_exist_p == -1:
out += "
"
elif format == "hd":
# HTML detailed format
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
# look for detailed format existence:
query = "SELECT value FROM bibfmt WHERE id_bibrec='%s' AND format='%s'" % (recID, format)
res = run_sql(query, None, 1)
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format', so try to call BibFormat on the fly or use default format:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
# BibFormat produced nothing, fall back to the built-in detailed template
out += websearch_templates.tmpl_print_record_detailed(
ln = ln,
recID = recID,
weburl = weburl,
)
elif format.startswith("hb_") or format.startswith("hd_"):
# underscore means that HTML brief/detailed formats should be called on-the-fly; suitable for testing formats
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
elif format.startswith("hx"):
# BibTeX format, called on the fly:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += call_bibformat(recID, format, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
elif format.startswith("hs"):
# for citation/download similarity navigation links:
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
out += '' % websearch_templates.build_search_url(recid=recID, ln=ln)
# firstly, title:
titles = get_fieldvalues(recID, "245__a")
if titles:
for title in titles:
out += "%s" % title
else:
# usual title not found, try conference title:
titles = get_fieldvalues(recID, "111__a")
if titles:
for title in titles:
out += "%s" % title
else:
# just print record ID:
out += "%s %d" % (get_field_i18nname("record ID", ln), recID)
out += ""
# secondly, authors:
# only the first author is shown; " et al" is appended when there are more
authors = get_fieldvalues(recID, "100__a") + get_fieldvalues(recID, "700__a")
if authors:
out += " - %s" % authors[0]
if len(authors) > 1:
out += " et al"
# thirdly publication info:
# the first non-empty of 773__s, 909C4s, 037__a, 088__a is used
publinfos = get_fieldvalues(recID, "773__s")
if not publinfos:
publinfos = get_fieldvalues(recID, "909C4s")
if not publinfos:
publinfos = get_fieldvalues(recID, "037__a")
if not publinfos:
publinfos = get_fieldvalues(recID, "088__a")
if publinfos:
out += " - %s" % publinfos[0]
else:
# fourthly publication year (if not publication info):
years = get_fieldvalues(recID, "773__y")
if not years:
years = get_fieldvalues(recID, "909C4y")
if not years:
years = get_fieldvalues(recID, "260__c")
if years:
out += " (%s)" % years[0]
else:
# HTML brief format by default
if record_exist_p == -1:
out += _("The record has been deleted.")
else:
query = "SELECT value FROM bibfmt WHERE id_bibrec='%s' AND format='%s'" % (recID, format)
res = run_sql(query)
if res:
# record 'recID' is formatted in 'format', so print it
out += "%s" % decompress(res[0][0])
else:
# record 'recID' is not formatted in 'format', so try to call BibFormat on the fly: or use default format:
if CFG_WEBSEARCH_CALL_BIBFORMAT:
out_record_in_format = call_bibformat(recID, format, ln, search_pattern=search_pattern,
- uid=uid, verbose=verbose)
+ user_info=user_info, verbose=verbose)
if out_record_in_format:
out += out_record_in_format
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
weburl = weburl,
)
else:
out += websearch_templates.tmpl_print_record_brief(
ln = ln,
recID = recID,
weburl = weburl,
)
# at the end of HTML brief mode, print the "Detailed record" functionality:
if format == 'hp' or format.startswith("hb_") or format.startswith("hd_"):
pass # do nothing for portfolio and on-the-fly formats
else:
out += websearch_templates.tmpl_print_record_brief_links(
ln = ln,
recID = recID,
weburl = weburl,
)
# print record closing tags, if needed:
if format == "marcxml" or format == "oai_dc":
out += " \n"
out += " \n"
return out
def encode_for_xml(s):
    """Encode special chars in string so that it would be XML-compliant.

    Returns a copy of 's' in which every '&' is replaced by '&amp;' and
    every '<' by '&lt;'.  Other characters are returned unchanged.
    """
    # '&' has to be escaped first: doing it after '<' would escape the
    # ampersand of the freshly inserted '&lt;' a second time.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;')
    return s
-def call_bibformat(recID, format="HD", ln=cdslang, search_pattern=None, uid=None, verbose=0):
+def call_bibformat(recID, format="HD", ln=cdslang, search_pattern=None, user_info=None, verbose=0):
"""
Call BibFormat and return the formatted record.

BibFormat will decide by itself if old or new BibFormat must be used.

When 'search_pattern' is not None it is converted to a string and
split with create_basic_search_units(); the second element of every
unit whose first element is not '-' is collected and handed to
format_record() as its 'search_pattern' argument.  Otherwise an empty
list is handed over.

'recID', 'ln', 'verbose' and the user argument are forwarded to
format_record() unchanged; 'format' is passed as 'of'.  The value
returned by format_record() is returned as is.
"""
keywords = []
if search_pattern is not None:
units = create_basic_search_units(None, str(search_pattern), None)
# presumably unit[0] is the boolean operator of the unit and '-' marks
# excluded terms, unit[1] being the term itself -- TODO confirm against
# create_basic_search_units
keywords = [unit[1] for unit in units if unit[0] != '-']
return format_record(recID,
of=format,
ln=ln,
search_pattern=keywords,
- uid=uid,
+ user_info=user_info,
verbose=verbose)
def log_query(hostname, query_args, uid=-1):
    """
    Log query into the query and user_query tables.
    Return id_query or None in case of problems.

    Nothing is logged, and None is returned, unless 'uid' is greater
    than zero.

    Parameters:
      hostname   - value stored in the 'hostname' column of user_query.
      query_args - URL arguments of the query; looked up in, and if
                   necessary inserted into, the 'urlargs' column of
                   the query table.
      uid        - id of the user the query is attributed to.
    """
    id_query = None
    if uid > 0:
        # log the query only if uid is reasonable
        res = run_sql("SELECT id FROM query WHERE urlargs=%s", (query_args,), 1)
        try:
            id_query = res[0][0]
        except (IndexError, TypeError):
            # no usable row came back, so these URL arguments are not
            # known yet: register them and use what run_sql returns for
            # the INSERT as the query id
            id_query = run_sql("INSERT INTO query (type, urlargs) VALUES ('r', %s)", (query_args,))
        if id_query:
            run_sql("INSERT INTO user_query (id_user, id_query, hostname, date) VALUES (%s, %s, %s, %s)",
                    (uid, id_query, hostname,
                     time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    return id_query
def log_query_info(action, p, f, colls, nb_records_found_total=-1):
    """Write some info to the log file for later analysis.

    Appends one line to the file 'search.log' inside 'logdir', of the
    form

        YYYYmmddHHMMSS#action#p#f#coll1,coll2,...#nb_records_found_total

    Parameters:
      action - action label written as the second field.
      p      - search pattern (string).
      f      - search field (string).
      colls  - sequence of collections; written comma-separated.  An
               empty sequence yields an empty field.
      nb_records_found_total - number written as the last field.

    Logging is best-effort: any error while building or writing the
    line is ignored so that a logging problem never breaks the caller.
    Always returns None.
    """
    try:
        # Build the complete line before touching the file, so that a
        # formatting problem can never leave a truncated entry behind.
        entry = "".join([time.strftime("%Y%m%d%H%M%S#", time.localtime()),
                         action + "#",
                         p + "#",
                         f + "#",
                         ",".join(["%s" % coll for coll in colls]) + "#",
                         "%d" % nb_records_found_total,
                         "\n"])
        log = open(logdir + "/search.log", "a")
        try:
            log.write(entry)
        finally:
            # release the file handle even when the write fails
            log.close()
    except Exception:
        # deliberately ignored, see docstring
        pass
    return
def wash_url_argument(var, new_type):
    """Wash list argument into 'new_type', that can be 'list',
    'str', or 'int'.  Useful for washing mod_python passed
    arguments, that are all lists of strings (URL args may be
    multiple), but we sometimes want only to take the first value,
    and sometimes to represent it as string or numerical value.

    Result by 'new_type':
      'list' - 'var' itself when it is a list, otherwise [var].
      'str'  - the first element of a list formatted as a string (""
               when that is not possible, e.g. for an empty list), a
               str unchanged, anything else formatted with "%s".
      'int'  - the base-10 integer parsed from the first element of a
               list or from a str, an int unchanged; 0 whenever parsing
               is not possible or 'var' is of any other type.
    Any other 'new_type' yields an empty list.
    """
    # Exact type checks are used (not isinstance) so that the handling
    # of list/str/int subclasses stays what it has always been.
    var_type = type(var)
    if new_type == 'list':  # return lst
        if var_type is list:
            return var
        return [var]
    if new_type == 'str':  # return str
        if var_type is list:
            try:
                return "%s" % var[0]
            except (IndexError, TypeError, ValueError):
                return ""
        if var_type is str:
            return var
        return "%s" % var
    if new_type == 'int':  # return int
        if var_type is list:
            try:
                # explicit base: like string.atoi, this only accepts
                # strings, so non-string elements are washed to 0
                return int(var[0], 10)
            except (IndexError, TypeError, ValueError):
                return 0
        if var_type is int:
            return var
        if var_type is str:
            try:
                return int(var, 10)
            except (TypeError, ValueError):
                return 0
        return 0
    return []
### CALLABLES
def perform_request_search(req=None, cc=cdsname, c=None, p="", f="", rg=10, sf="", so="d", sp="", rm="", of="id", ot="", as=0,
p1="", f1="", m1="", op1="", p2="", f2="", m2="", op2="", p3="", f3="", m3="", sc=0, jrec=0,
recid=-1, recidb=-1, sysno="", id=-1, idb=-1, sysnb="", action="", d1="",
d1y=0, d1m=0, d1d=0, d2="", d2y=0, d2m=0, d2d=0, dt="", verbose=0, ap=0, ln=cdslang, ec=None, tab=""):
"""Perform search or browse request, without checking for
authentication. Return list of recIDs found, if of=id.
Otherwise create web page.
The arguments are as follows:
req - mod_python Request class instance.
cc - current collection (e.g. "ATLAS"). The collection the
user started to search/browse from.
c - collection list (e.g. ["Theses", "Books"]). The
collections user may have selected/deselected when
starting to search from 'cc'.
p - pattern to search for (e.g. "ellis and muon or kaon").
f - field to search within (e.g. "author").
rg - records in groups of (e.g. "10"). Defines how many hits
per collection in the search results page are
displayed.
sf - sort field (e.g. "title").
so - sort order ("a"=ascending, "d"=descending).
sp - sort pattern (e.g. "CERN-") -- in case there are more
values in a sort field, this argument tells which one
to prefer
rm - ranking method (e.g. "jif"). Defines whether results
should be ranked by some known ranking method.
of - output format (e.g. "hb"). Usually starting "h" means
HTML output (and "hb" for HTML brief, "hd" for HTML
detailed), "x" means XML output, "t" means plain text
output, "id" means no output at all but to return list
of recIDs found. (Suitable for high-level API.)
ot - output only these MARC tags (e.g. "100,700,909C0b").
Useful if only some fields are to be shown in the
output, e.g. for library to control some fields.
as - advanced search ("0" means no, "1" means yes). Whether
search was called from within the advanced search
interface.
p1 - first pattern to search for in the advanced search
interface. Much like 'p'.
f1 - first field to search within in the advanced search
interface. Much like 'f'.
m1 - first matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
op1 - first operator, to join the first and the second unit
in the advanced search interface. ("a" add, "o" or,
"n" not).
p2 - second pattern to search for in the advanced search
interface. Much like 'p'.
f2 - second field to search within in the advanced search
interface. Much like 'f'.
m2 - second matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
op2 - second operator, to join the second and the third unit
in the advanced search interface. ("a" add, "o" or,
"n" not).
p3 - third pattern to search for in the advanced search
interface. Much like 'p'.
f3 - third field to search within in the advanced search
interface. Much like 'f'.
m3 - third matching type in the advanced search interface.
("a" all of the words, "o" any of the words, "e" exact
phrase, "p" partial phrase, "r" regular expression).
sc - split by collection ("0" no, "1" yes). Governs whether
we want to present the results in a single huge list,
or splitted by collection.
jrec - jump to record (e.g. "234"). Used for navigation
inside the search results.
recid - display record ID (e.g. "20000"). Do not
search/browse but go straight away to the Detailed
record page for the given recID.
recidb - display record ID bis (e.g. "20010"). If greater than
'recid', then display records from recid to recidb.
Useful for example for dumping records from the
database for reformatting.
sysno - display old system SYS number (e.g. ""). If you
migrate to CDS Invenio from another system, and store your
old SYS call numbers, you can use them instead of recid
if you wish so.
id - the same as recid, in case recid is not set. For
backwards compatibility.
idb - the same as recid, in case recidb is not set. For
backwards compatibility.
sysnb - the same as sysno, in case sysno is not set. For
backwards compatibility.
action - action to do. "SEARCH" for searching, "Browse" for
browsing. Default is to search.
d1 - first datetime in full YYYY-mm-dd HH:MM:DD format
(e.g. "1998-08-23 12:34:56"). Useful for search limits
on creation/modification date (see 'dt' argument
below). Note that 'd1' takes precedence over d1y, d1m,
d1d if these are defined.
d1y - first date's year (e.g. "1998"). Useful for search
limits on creation/modification date.
d1m - first date's month (e.g. "08"). Useful for search
limits on creation/modification date.
d1d - first date's day (e.g. "23"). Useful for search
limits on creation/modification date.
d2 - second datetime in full YYYY-mm-dd HH:MM:DD format
(e.g. "1998-09-02 12:34:56"). Useful for search limits
on creation/modification date (see 'dt' argument
below). Note that 'd2' takes precedence over d2y, d2m,
d2d if these are defined.
d2y - second date's year (e.g. "1998"). Useful for search
limits on creation/modification date.
d2m - second date's month (e.g. "09"). Useful for search
limits on creation/modification date.
d2d - second date's day (e.g. "02"). Useful for search
limits on creation/modification date.
dt - first and second date's type (e.g. "c"). Specifies
whether to search in creation dates ("c") or in
modification dates ("m"). When dt is not set and d1*
and d2* are set, the default is "c".
verbose - verbose level (0=min, 9=max). Useful to print some
internal information on the searching process in case
something goes wrong.
ap - alternative patterns (0=no, 1=yes). In case no exact
match is found, the search engine can try alternative
patterns e.g. to replace non-alphanumeric characters by
a boolean query. ap defines if this is wanted.
ln - language of the search interface (e.g. "en"). Useful
for internationalization.
ec - list of external search engines to search as well
(e.g. "SPIRES HEP").
"""
selected_external_collections_infos = None
# wash all arguments requiring special care
try:
(cc, colls_to_display, colls_to_search) = wash_colls(cc, c, sc) # which colls to search and to display?
except InvenioWebSearchUnknownCollectionError, exc:
colname = exc.colname
if of.startswith("h"):
page_start(req, of, cc, as, ln, getUid(req),
websearch_templates.tmpl_collection_not_found_page_title(colname, ln))
req.write(websearch_templates.tmpl_collection_not_found_page_body(colname, ln))
return page_end(req, of, ln)
elif of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
return page_end(req, of, ln)
p = wash_pattern(p)
f = wash_field(f)
p1 = wash_pattern(p1)
f1 = wash_field(f1)
p2 = wash_pattern(p2)
f2 = wash_field(f2)
p3 = wash_pattern(p3)
f3 = wash_field(f3)
datetext1, datetext2 = wash_dates(d1, d1y, d1m, d1d, d2, d2y, d2m, d2d)
_ = gettext_set_language(ln)
# backwards compatibility: id, idb, sysnb -> recid, recidb, sysno (if applicable)
if sysnb != "" and sysno == "":
sysno = sysnb
if id > 0 and recid == -1:
recid = id
if idb > 0 and recidb == -1:
recidb = idb
# TODO deduce passed search limiting criterias (if applicable)
pl, pl_in_url = "", "" # no limits by default
if action != "browse" and req and req.args: # we do not want to add options while browsing or while calling via command-line
fieldargs = cgi.parse_qs(req.args)
for fieldcode in get_fieldcodes():
if fieldargs.has_key(fieldcode):
for val in fieldargs[fieldcode]:
pl += "+%s:\"%s\" " % (fieldcode, val)
pl_in_url += "&%s=%s" % (urllib.quote(fieldcode), urllib.quote(val))
# deduce recid from sysno argument (if applicable):
if sysno: # ALEPH SYS number was passed, so deduce DB recID for the record:
recid = get_mysql_recid_from_aleph_sysno(sysno)
# deduce collection we are in (if applicable):
if recid > 0:
cc = guess_primary_collection_of_a_record(recid)
# deduce user id (if applicable):
try:
uid = getUid(req)
except:
uid = 0
## 0 - start output
if recid > 0:
## 1 - detailed record display
title, description, keywords = \
websearch_templates.tmpl_record_page_header_content(req, recid, ln)
page_start(req, of, cc, as, ln, uid, title, description, keywords, recid, tab)
# Default format is hb but we are in detailed -> change 'of'
if of == "hb":
of = "hd"
if record_exists(recid):
if recidb <= recid: # sanity check
recidb = recid + 1
if of == "id":
return [recidx for recidx in range(recid, recidb) if record_exists(recidx)]
else:
print_records(req, range(recid, recidb), -1, -9999, of, ot, ln, search_pattern=p, verbose=verbose, tab=tab)
if req and of.startswith("h"): # register detailed record page view event
client_ip_address = str(req.get_remote_host(apache.REMOTE_NOLOOKUP))
register_page_view_event(recid, uid, client_ip_address)
else: # record does not exist
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
elif of.startswith("h"):
print_warning(req, "Requested record does not seem to exist.")
elif action == "browse":
## 2 - browse needed
page_start(req, of, cc, as, ln, uid, _("Browse"))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
try:
if as == 1 or (p1 or p2 or p3):
browse_pattern(req, colls_to_search, p1, f1, rg, ln)
browse_pattern(req, colls_to_search, p2, f2, rg, ln)
browse_pattern(req, colls_to_search, p3, f3, rg, ln)
else:
browse_pattern(req, colls_to_search, p, f, rg, ln)
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
elif rm and p.startswith("recid:"):
## 3-ter - similarity search needed
page_start(req, of, cc, as, ln, uid, _("Search Results"))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
if record_exists(p[6:]) != 1:
# record does not exist
if of.startswith("h"):
print_warning(req, "Requested record does not seem to exist.")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# record well exists, so find similar ones to it
t1 = os.times()[4]
results_similar_recIDs, results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, results_similar_comments = \
rank_records(rm, 0, get_collection_reclist(cdsname), string.split(p), verbose)
if results_similar_recIDs:
t2 = os.times()[4]
cpu_time = t2 - t1
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cdsname, len(results_similar_recIDs),
jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
print_warning(req, results_similar_comments)
print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln,
results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose)
elif of=="id":
return results_similar_recIDs
elif of.startswith("x"):
print_records(req, results_similar_recIDs, jrec, rg, of, ot, ln,
results_similar_relevances, results_similar_relevances_prologue, results_similar_relevances_epilogue, search_pattern=p, verbose=verbose)
else:
# rank_records failed and returned some error message to display:
if of.startswith("h"):
print_warning(req, results_similar_relevances_prologue)
print_warning(req, results_similar_relevances_epilogue)
print_warning(req, results_similar_comments)
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
elif p.startswith("cocitedwith:"): #WAS EXPERIMENTAL
## 3-terter - cited by search needed
page_start(req, of, cc, as, ln, uid, _("Search Results"))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
recID = p[12:]
if record_exists(recID) != 1:
# record does not exist
if of.startswith("h"):
print_warning(req, "Requested record does not seem to exist.")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# record well exists, so find co-cited ones:
t1 = os.times()[4]
results_cocited_recIDs = map(lambda x: x[0], calculate_co_cited_with_list(int(recID)))
if results_cocited_recIDs:
t2 = os.times()[4]
cpu_time = t2 - t1
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, cdsname, len(results_cocited_recIDs),
jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose)
elif of=="id":
return results_cocited_recIDs
elif of.startswith("x"):
print_records(req, results_cocited_recIDs, jrec, rg, of, ot, ln, search_pattern=p, verbose=verbose)
else:
# cited rank_records failed and returned some error message to display:
if of.startswith("h"):
print_warning(req, "nothing found")
if of == "id":
return []
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
## 3 - common search needed
page_start(req, of, cc, as, ln, uid, _("Search Results"))
if of.startswith("h"):
req.write(create_search_box(cc, colls_to_display, p, f, rg, sf, so, sp, rm, of, ot, as, ln, p1, f1, m1, op1,
p2, f2, m2, op2, p3, f3, m3, sc, pl, d1y, d1m, d1d, d2y, d2m, d2d, dt, jrec, ec, action))
t1 = os.times()[4]
results_in_any_collection = HitSet()
if as == 1 or (p1 or p2 or p3):
## 3A - advanced search
try:
results_in_any_collection = search_pattern(req, p1, f1, m1, ap=ap, of=of, verbose=verbose, ln=ln)
if len(results_in_any_collection) == 0:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
if p2:
results_tmp = search_pattern(req, p2, f2, m2, ap=ap, of=of, verbose=verbose, ln=ln)
if op1 == "a": # add
results_in_any_collection.intersection_update(results_tmp)
elif op1 == "o": # or
results_in_any_collection.union_update(results_tmp)
elif op1 == "n": # not
results_in_any_collection.difference_update(results_tmp)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % op1, "Error")
if len(results_in_any_collection) == 0:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
if p3:
results_tmp = search_pattern(req, p3, f3, m3, ap=ap, of=of, verbose=verbose, ln=ln)
if op2 == "a": # add
results_in_any_collection.intersection_update(results_tmp)
elif op2 == "o": # or
results_in_any_collection.union_update(results_tmp)
elif op2 == "n": # not
results_in_any_collection.difference_update(results_tmp)
else:
if of.startswith("h"):
print_warning(req, "Invalid set operation %s." % op2, "Error")
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
else:
## 3B - simple search
try:
results_in_any_collection = search_pattern(req, p, f, ap=ap, of=of, verbose=verbose, ln=ln)
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if len(results_in_any_collection) == 0:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
# search_cache_key = p+"@"+f+"@"+string.join(colls_to_search,",")
# if search_cache.has_key(search_cache_key): # is the result in search cache?
# results_final = search_cache[search_cache_key]
# else:
# results_final = search_pattern(req, p, f, colls_to_search)
# search_cache[search_cache_key] = results_final
# if len(search_cache) > CFG_WEBSEARCH_SEARCH_CACHE_SIZE: # is the cache full? (sanity cleaning)
# search_cache.clear()
# search stage 4: intersection with collection universe:
try:
results_final = intersect_results_with_collrecs(req, results_in_any_collection, colls_to_search, ap, of, verbose, ln)
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {}:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
if of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
# search stage 5: apply search option limits and restrictions:
if datetext1 != "":
if verbose and of.startswith("h"):
print_warning(req, "Search stage 5: applying time limits, from %s until %s..." % (datetext1, datetext2))
try:
results_final = intersect_results_with_hitset(req,
results_final,
search_unit_in_bibrec(datetext1, datetext2, dt),
ap,
aptext= _("No match within your time limits, "
"discarding this condition..."),
of=of)
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {}:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if pl:
pl = wash_pattern(pl)
if verbose and of.startswith("h"):
print_warning(req, "Search stage 5: applying search pattern limit %s..." % (pl,))
try:
results_final = intersect_results_with_hitset(req,
results_final,
search_pattern(req, pl, ap=0, ln=ln),
ap,
aptext=_("No match within your search limits, "
"discarding this condition..."),
of=of)
except:
if of.startswith("h"):
req.write(create_error_box(req, verbose=verbose, ln=ln))
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
if results_final == {}:
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
if of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
return page_end(req, of, ln)
t2 = os.times()[4]
cpu_time = t2 - t1
## search stage 6: display results:
results_final_nb_total = 0
results_final_nb = {} # will hold number of records found in each collection
# (in simple dict to display overview more easily)
for coll in results_final.keys():
results_final_nb[coll] = len(results_final[coll])
#results_final_nb_total += results_final_nb[coll]
# Now let us calculate results_final_nb_total more precisely,
# in order to get the total number of "distinct" hits across
# searched collections; this is useful because a record might
# have been attributed to more than one primary collection; so
# we have to avoid counting it multiple times. The price to
# pay for this accuracy of results_final_nb_total is somewhat
# increased CPU time.
if results_final.keys() == 1:
# only one collection; no need to union them
results_final_for_all_selected_colls = results_final.values()[0]
results_final_nb_total = results_final_nb.values()[0]
else:
# okay, some work ahead to union hits across collections:
results_final_for_all_selected_colls = HitSet()
for coll in results_final.keys():
results_final_for_all_selected_colls.union_update(results_final[coll])
results_final_nb_total = len(results_final_for_all_selected_colls)
if results_final_nb_total == 0:
if of.startswith('h'):
print_warning(req, "No match found, please enter different search terms.")
elif of.startswith("x"):
# Print empty, but valid XML
print_records_prologue(req, of)
print_records_epilogue(req, of)
else:
# yes, some hits found: good!
# collection list may have changed due to not-exact-match-found policy so check it out:
for coll in results_final.keys():
if coll not in colls_to_search:
colls_to_search.append(coll)
# print results overview:
if of == "id":
# we have been asked to return list of recIDs
recIDs = list(results_final_for_all_selected_colls)
if sf: # do we have to sort?
recIDs = sort_records(req, recIDs, sf, so, sp, verbose, of)
elif rm: # do we have to rank?
results_final_for_all_colls_rank_records_output = rank_records(rm, 0, results_final_for_all_selected_colls,
string.split(p) + string.split(p1) +
string.split(p2) + string.split(p3), verbose)
if results_final_for_all_colls_rank_records_output[0]:
recIDs = results_final_for_all_colls_rank_records_output[0]
return recIDs
elif of.startswith("h"):
req.write(print_results_overview(req, colls_to_search, results_final_nb_total, results_final_nb, cpu_time, ln, ec))
selected_external_collections_infos = print_external_results_overview(req, cc, [p, p1, p2, p3], f, ec, verbose, ln)
# print number of hits found for XML outputs:
if of.startswith("x"):
req.write("\n" % results_final_nb_total)
# print records:
if len(colls_to_search)>1:
cpu_time = -1 # we do not want to have search time printed on each collection
print_records_prologue(req, of)
for coll in colls_to_search:
if results_final.has_key(coll) and len(results_final[coll]):
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll],
jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time))
results_final_recIDs = list(results_final[coll])
results_final_relevances = []
results_final_relevances_prologue = ""
results_final_relevances_epilogue = ""
if sf: # do we have to sort?
results_final_recIDs = sort_records(req, results_final_recIDs, sf, so, sp, verbose, of)
elif rm: # do we have to rank?
results_final_recIDs_ranked, results_final_relevances, results_final_relevances_prologue, results_final_relevances_epilogue, results_final_comments = \
rank_records(rm, 0, results_final[coll],
string.split(p) + string.split(p1) +
string.split(p2) + string.split(p3), verbose)
if of.startswith("h"):
print_warning(req, results_final_comments)
if results_final_recIDs_ranked:
results_final_recIDs = results_final_recIDs_ranked
else:
# rank_records failed and returned some error message to display:
print_warning(req, results_final_relevances_prologue)
print_warning(req, results_final_relevances_epilogue)
print_records(req, results_final_recIDs, jrec, rg, of, ot, ln,
results_final_relevances,
results_final_relevances_prologue,
results_final_relevances_epilogue,
search_pattern=p,
print_records_prologue_p=False,
print_records_epilogue_p=False,
verbose=verbose)
if of.startswith("h"):
req.write(print_search_info(p, f, sf, so, sp, rm, of, ot, coll, results_final_nb[coll],
jrec, rg, as, ln, p1, p2, p3, f1, f2, f3, m1, m2, m3, op1, op2,
sc, pl_in_url,
d1y, d1m, d1d, d2y, d2m, d2d, dt, cpu_time, 1))
print_records_epilogue(req, of)
if f == "author" and of.startswith("h"):
req.write(create_similarly_named_authors_link_box(p, ln))
# log query:
try:
id_query = log_query(req.get_remote_host(), req.args, uid)
if of.startswith("h") and id_query:
# Alert/RSS teaser:
req.write(websearch_templates.tmpl_alert_rss_teaser_box_for_query(id_query, ln=ln))
except:
# do not log query if req is None (used by CLI interface)
pass
log_query_info("ss", p, f, colls_to_search, results_final_nb_total)
# External searches
if of.startswith("h"):
perform_external_collection_search(req, cc, [p, p1, p2, p3], f, ec, verbose, ln, selected_external_collections_infos)
return page_end(req, of, ln)
def perform_request_cache(req, action="show"):
    """Write an HTML report of the search engine caches to req.

    The report has four sections: the collection reclist cache, the
    search cache, the field I18N names cache and the collection I18N
    names cache.

    Parameters:
      req    - request object; its content_type is set to "text/html",
               the HTTP header is sent, and the page is written to it.
      action - when equal to "clear", the search cache is emptied and
               the collection reclist cache is rebuilt before the report
               is produced.  Any other value only shows the caches.

    Returns the string "\\n".

    NOTE(review): the HTML tags inside the string literals below were
    reconstructed, because the markup of this function had been lost.
    The visible text and every %-placeholder are kept as they were, but
    the exact tags, and the path of the "clear cache" link, should be
    confirmed against the deployed page.
    """
    global search_cache
    global collection_reclist_cache
    global collection_reclist_cache_timestamp
    global field_i18nname_cache
    global field_i18nname_cache_timestamp
    global collection_i18nname_cache
    global collection_i18nname_cache_timestamp

    req.content_type = "text/html"
    req.send_http_header()

    # Collect page fragments and join them once at the end instead of
    # growing one string with repeated "+=".
    out = []
    out.append("<h1>Search Cache</h1>")

    # clear cache if requested:
    if action == "clear":
        search_cache = {}
        collection_reclist_cache = create_collection_reclist_cache()

    # show collection reclist cache:
    out.append("<h3>Collection reclist cache</h3>")
    out.append("- collection table last updated: %s" % get_table_update_time('collection'))
    out.append("<br>- reclist cache timestamp: %s" % collection_reclist_cache_timestamp)
    out.append("<br>- reclist cache contents:")
    out.append("<blockquote>")
    # Iterate over a snapshot of the keys and look each entry up again,
    # exactly as before, since get_collection_reclist() is opaque here.
    for coll in list(collection_reclist_cache.keys()):
        if collection_reclist_cache[coll]:
            out.append("%s (%d)<br>" % (coll, len(get_collection_reclist(coll))))
    out.append("</blockquote>")

    # show search cache:
    out.append("<h3>Search Cache</h3>")
    out.append("<blockquote>")
    if len(search_cache):
        out.append("""<table border="1">""")
        out.append("<tr><td><strong>%s</strong></td><td><strong>%s</strong></td>"
                   "<td><strong>%s</strong></td><td><strong>%s</strong></td></tr>" %
                   ("Pattern", "Field", "Collection", "Number of Hits"))
        for search_cache_key in list(search_cache.keys()):
            # A cache key is split on "@" into pattern, field, collection;
            # maxsplit=2 keeps any further "@" inside the last part.
            p, f, c = search_cache_key.split("@", 2)
            # find out about length of cached data:
            cached = search_cache[search_cache_key]
            nb_hits = 0
            for coll in cached:
                nb_hits += len(cached[coll])
            # NOTE(review): p, f and c are written into the page without
            # HTML escaping; if cache keys can hold user-typed patterns
            # they should be escaped here -- confirm and fix.
            out.append("<tr><td>%s</td><td>%s</td><td>%s</td><td>%d</td></tr>" %
                       (p, f, c, nb_hits))
        out.append("</table>")
    else:
        out.append("<p>Search cache is empty.")
    out.append("</blockquote>")
    # The literal needs a %s for weburl; without one the % operator
    # raises TypeError ("not all arguments converted").
    out.append("""<p><a href="%s/search/cache?action=clear">clear cache</a>""" % weburl)

    # show field i18nname cache:
    out.append("<h3>Field I18N names cache</h3>")
    out.append("- fieldname table last updated: %s" % get_table_update_time('fieldname'))
    out.append("<br>- i18nname cache timestamp: %s" % field_i18nname_cache_timestamp)
    out.append("<br>- i18nname cache contents:")
    out.append("<blockquote>")
    for field in field_i18nname_cache.keys():
        for ln in field_i18nname_cache[field].keys():
            out.append("%s, %s = %s<br>" % (field, ln, field_i18nname_cache[field][ln]))
    out.append("</blockquote>")

    # show collection i18nname cache:
    out.append("<h3>Collection I18N names cache</h3>")
    out.append("- collectionname table last updated: %s" % get_table_update_time('collectionname'))
    out.append("<br>- i18nname cache timestamp: %s" % collection_i18nname_cache_timestamp)
    out.append("<br>- i18nname cache contents:")
    out.append("<blockquote>")
    for coll in collection_i18nname_cache.keys():
        for ln in collection_i18nname_cache[coll].keys():
            out.append("%s, %s = %s<br>" % (coll, ln, collection_i18nname_cache[coll][ln]))
    out.append("</blockquote>")

    req.write("<html>")
    req.write("".join(out))
    req.write("</html>")
    return "\n"
def perform_request_log(req, date=""):
"""Display search log information for given date."""
req.content_type = "text/html"
req.send_http_header()
req.write("")
req.write("
Search Log
")
if date: # case A: display stats for a day
yyyymmdd = string.atoi(date)
req.write("
Date: %d
" % yyyymmdd)
req.write("""
""")
req.write("
%s
%s
%s
%s
%s
%s
" % ("No.", "Time", "Pattern", "Field", "Collection", "Number of Hits"))
# read file:
p = os.popen("grep ^%d %s/search.log" % (yyyymmdd, logdir), 'r')
lines = p.readlines()
p.close()
# process lines:
i = 0
for line in lines:
try:
datetime, as, p, f, c, nbhits = string.split(line,"#")
i += 1
req.write("
")
else: # case B: display summary stats per day
yyyymm01 = int(time.strftime("%Y%m01", time.localtime()))
yyyymmdd = int(time.strftime("%Y%m%d", time.localtime()))
req.write("""
""")
req.write("
%s
%s
" % ("Day", "Number of Queries"))
for day in range(yyyymm01, yyyymmdd + 1):
p = os.popen("grep -c ^%d %s/search.log" % (day, logdir), 'r')
for line in p.readlines():
req.write("""