diff --git a/invenio/legacy/bibindex/engine_utils.py b/invenio/legacy/bibindex/engine_utils.py index 845a6c4d3..54671c77d 100644 --- a/invenio/legacy/bibindex/engine_utils.py +++ b/invenio/legacy/bibindex/engine_utils.py @@ -1,555 +1,555 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """bibindex.engine_utils: here are some useful regular experssions for tokenizers and several helper functions. """ import re import sys from invenio.base.helpers import utf8ifier from invenio.legacy.dbquery import run_sql, \ DatabaseError from invenio.legacy.bibsched.bibtask import write_message from invenio.legacy.bibrecord import get_fieldvalues from invenio.config import \ CFG_BIBINDEX_CHARS_PUNCTUATION, \ CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]') phrase_delimiter_re = re.compile(r'[\.:;\?\!]') space_cleaner_re = re.compile(r'\s+') re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+") re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$") re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION) re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS) re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d') re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]') # FIXME: re_pattern_fuzzy_author_trigger could be removed and an # BibAuthorID API function could be called instead after we # double-check that there are no circular imports. def load_tokenizers(): """ Load all the bibindex tokenizers and returns it. """ import warnings warnings.warn("The function is deprecated. Please use the " "`load_tokenizers()` from `invenio.modules.indexer.utils`", DeprecationWarning) from invenio.modules.indexer.registry import tokenizers return dict((module.__name__.split('.')[-1], getattr(module, module.__name__.split('.')[-1], '')) for module in tokenizers) def get_all_index_names_and_column_values(column_name): """Returns a list of tuples of name and another column of all defined words indexes. Returns empty list in case there are no tags indexed in this index or in case the column name does not exist. 
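For quick orientation, a standalone sketch exercising two of the module-level patterns defined above (the sample strings are made up for illustration):

import re

latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')

# inline and display LaTeX formulas are matched as whole units
assert latex_formula_re.findall(r"mass $m_H$ and \[E = mc^2\]") == ['$m_H$', r'\[E = mc^2\]']
# arXiv identifiers are recognised only in the lowercase 'arxiv:NNNN.NNNN' form
assert re_arxiv.match("arxiv:1207.7214") is not None
assert re_arxiv.match("arXiv:1207.7214") is None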
Example: output=[('global', something), ('title', something)].""" out = [] query = """SELECT name, %s FROM idxINDEX""" % column_name try: res = run_sql(query) for row in res: out.append((row[0], row[1])) except DatabaseError: write_message("Exception caught for SQL statement: %s; column %s might not exist" % (query, column_name), sys.stderr) return out def get_all_synonym_knowledge_bases(): """Returns a dictionary of name key and knowledge base name and match type tuple value information of all defined words indexes that have knowledge base information. Returns empty dictionary in case there are no tags indexed. Example: output['global'] = ('INDEX-SYNONYM-TITLE', 'exact'), output['title'] = ('INDEX-SYNONYM-TITLE', 'exact').""" res = get_all_index_names_and_column_values("synonym_kbrs") out = {} for row in res: kb_data = row[1] # ignore empty strings if len(kb_data): out[row[0]] = tuple(kb_data.split(CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR)) return out def get_index_remove_stopwords(index_id): """Returns value of a remove_stopword field from idxINDEX database table if it's not 'No'. If it's 'No' returns False. Just for consistency with WordTable. @param index_id: id of the index """ try: result = run_sql("SELECT remove_stopwords FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'No' or result == '': return False return result def get_index_remove_html_markup(index_id): """ Gets remove_html_markup parameter from database ('Yes' or 'No') and changes it to True, False. Just for consistency with WordTable.""" try: result = run_sql("SELECT remove_html_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'Yes': return True return False def get_index_remove_latex_markup(index_id): """ Gets remove_latex_markup parameter from database ('Yes' or 'No') and changes it to True, False. Just for consistency with WordTable.""" try: result = run_sql("SELECT remove_latex_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'Yes': return True return False def author_name_requires_phrase_search(p): """ Detect whether author query pattern p requires phrase search. Notably, look for presence of spaces and commas. """ if re_pattern_fuzzy_author_trigger.search(p): return True return False def get_field_count(recID, tags): """ Return number of field instances having TAGS in record RECID. @param recID: record ID @type recID: int @param tags: list of tags to count, e.g. ['100__a', '700__a'] @type tags: list @return: number of tags present in record @rtype: int @note: Works internally via getting field values, which may not be very efficient. Could use counts only, or else retrieve stored recstruct format of the record and walk through it. """ out = 0 for tag in tags: out += len(get_fieldvalues(recID, tag)) return out def run_sql_drop_silently(query): """ SQL DROP statement with IF EXISTS part generates warning if table does not exist. To mute the warning we can remove IF EXISTS and catch SQL exception telling us that table does not exist. """ try: query = query.replace(" IF EXISTS", "") run_sql(query) except Exception as e: if str(e).find("Unknown table") > -1: pass else: raise e from invenio.modules.indexer.utils import get_idx_indexer def get_all_indexes(virtual=True, with_ids=False): """Returns the list of the names of all defined words indexes. Returns empty list in case there are no tags indexed in this index. 
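A minimal check of the fuzzy-author trigger used by author_name_requires_phrase_search() above; the names are illustrative only:

from invenio.legacy.bibindex.engine_utils import author_name_requires_phrase_search

# a comma, dot or whitespace in the pattern switches the author query to phrase search
assert author_name_requires_phrase_search("Ellis, J.")
# a bare single token can be handled by ordinary word search
assert not author_name_requires_phrase_search("Ellis")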
@param virtual: if True function will return also virtual indexes @param with_ids: if True function will return also IDs of found indexes Example: output=['global', 'author'].""" out = [] if virtual: query = """SELECT %s name FROM idxINDEX""" query = query % (with_ids and "id," or "") else: query = """SELECT %s w.name FROM idxINDEX AS w WHERE w.id NOT IN (SELECT DISTINCT id_virtual FROM idxINDEX_idxINDEX)""" query = query % (with_ids and "w.id," or "") res = run_sql(query) if with_ids: out = [row for row in res] else: out = [row[0] for row in res] return out def get_all_virtual_indexes(): """ Returns all defined 'virtual' indexes. """ query = """SELECT DISTINCT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_virtual=w.id""" res = run_sql(query) return res def get_index_virtual_indexes(index_id): """Returns 'virtual' indexes that should be indexed together with given index.""" query = """SELECT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_virtual=w.id AND v.id_normal=%s""" res = run_sql(query, (index_id,)) return res def is_index_virtual(index_id): """Checks if index is virtual""" query = """SELECT id_virtual FROM idxINDEX_idxINDEX WHERE id_virtual=%s""" res = run_sql(query, (index_id,)) if res: return True return False def filter_for_virtual_indexes(index_list): """ Function removes all non-virtual indexes from given list of indexes. @param index_list: list of index names """ try: virtual = zip(*get_all_virtual_indexes())[1] selected = set(virtual) & set(index_list) return list(selected) except IndexError: return [] return [] def get_virtual_index_building_blocks(index_id): """Returns indexes that made up virtual index of given index_id. If index_id is an id of normal index (not virtual) returns empty tuple. """ query = """SELECT v.id_normal, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_normal=w.id AND v.id_virtual=%s""" res = run_sql(query, (index_id,)) return res def get_index_id_from_index_name(index_name): """Returns the words/phrase index id for INDEXNAME. Returns empty string in case there is no words table for this index. Example: field='author', output=4.""" out = 0 query = """SELECT w.id FROM idxINDEX AS w WHERE w.name=%s LIMIT 1""" res = run_sql(query, (index_name,), 1) if res: out = res[0][0] return out def get_index_name_from_index_id(index_id): """Returns the words/phrase index name for INDEXID. Returns '' in case there is no words table for this indexid. Example: field=9, output='fulltext'.""" res = run_sql("SELECT name FROM idxINDEX WHERE id=%s", (index_id,)) if res: return res[0][0] return '' def get_field_tags(field, tagtype="marc"): """Returns a list of tags for the field code 'field'. Works for both MARC and nonMARC tags. Returns empty list in case of error. Example: field='author', output=['100__%','700__%']. 
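The index-lookup helpers above all go through run_sql, so their results depend on the idxINDEX tables of the running instance; a typical call pattern, with indicative values taken from the docstring examples, would be:

from invenio.legacy.bibindex.engine_utils import (
    filter_for_virtual_indexes,
    get_index_id_from_index_name,
    get_index_name_from_index_id,
)

index_id = get_index_id_from_index_name('author')     # e.g. 4; 0 when the index is unknown
index_name = get_index_name_from_index_id(index_id)   # 'author' again; '' when unknown
virtual_only = filter_for_virtual_indexes(['global', 'author'])  # e.g. ['global']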
@param tagtype: can be: "marc" or "nonmarc", default value is "marc" for backward compatibility """ out = [] query = """SELECT t.%s FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" if tagtype == "marc": query = query % "value" res = run_sql(query, (field,)) return [row[0] for row in res] else: query = query % "recjson_value" res = run_sql(query, (field,)) values = [] for row in res: if row[0] is not None: values.extend(row[0].split(",")) return values def get_marc_tag_indexes(tag, virtual=True): """Returns indexes names and ids corresponding to the given tag @param tag: MARC tag in one of the forms: 'xx%', 'xxx', 'xxx__a', 'xxx__%' @param virtual: if True function will also return virtual indexes""" tag2 = tag[0:2] + "%" #for tags in the form: 10% tag3 = tag[:-1] + "%" #for tags in the form: 100__% query = """SELECT DISTINCT w.id,w.name FROM idxINDEX AS w, idxINDEX_field AS wf, field_tag AS ft, tag as t WHERE (t.value=%%s OR t.value=%%s OR %s) AND t.id=ft.id_tag AND ft.id_field=wf.id_field AND wf.id_idxINDEX=w.id""" if tag[-1] == "%": missing_piece = "t.value LIKE %s" elif tag[-1] != "%" and len(tag) == 3: missing_piece = "t.value LIKE %s" tag3 = tag + "%" #for all tags which start from 'tag' else: missing_piece = "t.value=%s" query = query % missing_piece res = run_sql(query, (tag, tag2, tag3)) if res: if virtual: response = list(res) index_ids = map(str, zip(*res)[0]) query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v, idxINDEX as w WHERE v.id_virtual=w.id AND v.id_normal IN (""" query = query + ", ".join(index_ids) + ")" response.extend(run_sql(query)) return tuple(response) return res return () def get_nonmarc_tag_indexes(nonmarc_tag, virtual=True): """Returns index names and ids corresponding to the given nonmarc tag (nonmarc tag can be also called 'bibfield field'). If param 'virtual' is set to True function will also return virtual indexes""" query = """SELECT DISTINCT w.id, w.name FROM idxINDEX AS w, idxINDEX_field AS wf, field_tag AS ft, tag as t WHERE (t.recjson_value LIKE %s OR t.recjson_value LIKE %s OR t.recjson_value LIKE %s OR t.recjson_value=%s) AND t.id=ft.id_tag AND ft.id_field=wf.id_field AND wf.id_idxINDEX=w.id""" at_the_begining = nonmarc_tag + ',%%' in_the_middle = '%%,' + nonmarc_tag + ',%%' at_the_end = '%%,' + nonmarc_tag res = run_sql(query, (at_the_begining, in_the_middle, at_the_end, nonmarc_tag)) if res: if virtual: response = list(res) index_ids = map(str, zip(*res)[0]) query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v, idxINDEX as w WHERE v.id_virtual=w.id AND v.id_normal IN (""" query = query + ", ".join(index_ids) + ")" response.extend(run_sql(query)) return tuple(response) return res return () def get_index_tags(indexname, virtual=True, tagtype="marc"): """Returns the list of tags that are indexed inside INDEXNAME. Returns empty list in case there are no tags indexed in this index. Note: uses get_field_tags() defined before. Example: field='author', output=['100__%', '700__%']. 
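A short usage sketch of the two get_field_tags() modes described above; the returned values depend on the installation's tag configuration:

from invenio.legacy.bibindex.engine_utils import get_field_tags

# MARC mode (default): wildcarded MARC tags for the field code,
# e.g. ['100__%', '700__%'] for 'author'
marc_tags = get_field_tags('author')
# nonmarc mode: the comma-separated recjson_value column is split instead,
# yielding a list of bibfield field names
recjson_fields = get_field_tags('author', tagtype='nonmarc')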
@param tagtype: can be: "marc" or "nonmarc", default value is "marc" for backward compatibility """ out = [] query = """SELECT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE w.name=%s AND w.id=wf.id_idxINDEX AND f.id=wf.id_field""" res = run_sql(query, (indexname,)) for row in res: out.extend(get_field_tags(row[0], tagtype)) if not out and virtual: index_id = get_index_id_from_index_name(indexname) try: dependent_indexes = map(str, zip(*get_virtual_index_building_blocks(index_id))[0]) except IndexError: return out tags = set() query = """SELECT DISTINCT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE w.id=wf.id_idxINDEX AND f.id=wf.id_field AND w.id IN (""" query = query + ", ".join(dependent_indexes) + ")" res = run_sql(query) for row in res: tags |= set(get_field_tags(row[0], tagtype)) out = list(tags) out = [tag for tag in out if tag] return out def get_min_last_updated(indexes): """Returns min modification date for 'indexes': min(last_updated) @param indexes: list of indexes """ query= """SELECT min(last_updated) FROM idxINDEX WHERE name IN (""" for index in indexes: query += "%s," query = query[:-1] + ")" res = run_sql(query, tuple(indexes)) return res def remove_inexistent_indexes(indexes, leave_virtual=False): """Removes indexes that don't exist from the given list of indexes. @param indexes: list of indexes @param leave_virtual: should we leave virtual indexes in the list? """ correct_indexes = get_all_indexes(leave_virtual) cleaned = [] for index in indexes: if index in correct_indexes: cleaned.append(index) return cleaned def get_records_range_for_index(index_id): """ Get records range for given index. """ try: query = """SELECT min(id_bibrec), max(id_bibrec) FROM idxWORD%02dR""" % index_id resp = run_sql(query) if resp: return resp[0] return None except Exception: return None def make_prefix(index_name): """ Creates a prefix for specific index which is added to every word from this index stored in reversed table of corresponding virtual index. @param index_name: name of the dependent index we want to create prefix for """ return "__" + index_name + "__" class UnknownTokenizer(Exception): pass def list_union(list1, list2): "Returns union of the two lists." union_dict = {} for e in list1: union_dict[e] = 1 for e in list2: union_dict[e] = 1 return union_dict.keys() def get_index_fields(index_id): """Returns fields that are connected to index specified by index_id. """ query = """SELECT f.id, f.name FROM field as f, idxINDEX as w, idxINDEX_field as wf WHERE f.id=wf.id_field AND wf.id_idxINDEX=w.id AND w.id=%s """ index_fields = run_sql(query, (index_id, ) ) return index_fields def recognize_marc_tag(tag): """Checks if tag is a MARC tag or not""" tag_len = len(tag) if 3 <= tag_len <= 6 and tag[0:3].isdigit(): return True if tag_len == 3 and tag[0:2].isdigit() and tag[2] == '%': return True return False def _is_collection(subfield): """Checks if a type is a collection; get_values_recursively internal function.""" return hasattr(subfield, '__iter__') def _get_values(subfield): """Returns values of a subfield suitable for later tokenizing; get_values_recursively internal function.""" if type(subfield) == dict: return subfield.values() else: return subfield def get_values_recursively(subfield, phrases): """Finds all values suitable for later tokenizing in field/subfield of bibfield record. @param subfield: name of the field/subfield @param phrases: container for phrases (for example empty list) FIXME: move this function to bibfield! 
As soon as possible. Note that journal tokenizer also needs to be changed. """ if _is_collection(subfield): for s in _get_values(subfield): get_values_recursively(s, phrases) elif subfield is not None: phrases.append(utf8ifier(subfield)) diff --git a/invenio/legacy/bibindex/engine_washer.py b/invenio/legacy/bibindex/engine_washer.py index ed1484127..2a31f8d94 100644 --- a/invenio/legacy/bibindex/engine_washer.py +++ b/invenio/legacy/bibindex/engine_washer.py @@ -1,169 +1,169 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2009, 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. import re from invenio.legacy.bibindex.engine_stemmer import stem from invenio.legacy.bibindex.engine_stopwords import is_stopword from invenio.config import CFG_BIBINDEX_MIN_WORD_LENGTH, \ CFG_ETCDIR re_pattern_fuzzy_author_dots = re.compile(r'[\.\-]+') re_pattern_fuzzy_author_spaces = re.compile(r'\s+') re_pattern_author_canonical_id = re.compile(r'\.[0-9]+$') re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8")) re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8")) re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8")) re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8")) re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8")) re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8")) re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8")) re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8")) re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8")) re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8")) re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8")) re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8")) re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8")) re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8")) re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8")) re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8")) re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8")) re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8")) re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?") re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?") re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?") re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?") re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?") re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?") re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?") re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?") re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?") re_latex_uppercase_a = 
re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?") re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?") re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?") re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?") re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?") re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?") re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?") re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?") re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?") def lower_index_term(term): """ Return safely lowered index term TERM. This is done by converting to UTF-8 first, because standard Python lower() function is not UTF-8 safe. To be called by both the search engine and the indexer when appropriate (e.g. before stemming). In case of problems with UTF-8 compliance, this function raises UnicodeDecodeError, so the client code may want to catch it. """ return unicode(term, 'utf-8').lower().encode('utf-8') latex_markup_re = re.compile(r"\\begin(\[.+?\])?\{.+?\}|\\end\{.+?}|\\\w+(\[.+?\])?\{(?P.*?)\}|\{\\\w+ (?P.*?)\}") def remove_latex_markup(phrase): ret_phrase = '' index = 0 for match in latex_markup_re.finditer(phrase): ret_phrase += phrase[index:match.start()] ret_phrase += match.group('inside1') or match.group('inside2') or '' index = match.end() ret_phrase += phrase[index:] return ret_phrase def apply_stemming(word, stemming_language): """Returns word after applying stemming (if stemming language is set). You can change your stemming language in database. @param word: word to be checked @type word: str @param stemming_language: abbreviation of language or None @type stemming_language: str """ if stemming_language: word = stem(word, stemming_language) return word def remove_stopwords(word, stopwords_kb=None): """Returns word after stopword check. One must specify the name of the knowledge base. @param word: word to be checked @type word: str @param stopwords_kb: name of the stopwords knowledge base @type word: str """ if stopwords_kb is not None: if is_stopword(word, stopwords_kb): return "" return word def length_check(word): """Returns word after length check. @param word: word to be checked @type word: str """ if len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH: return "" return word def wash_index_term(term, max_char_length=50, lower_term=True): """ Return washed form of the index term TERM that would be suitable for storing into idxWORD* tables. I.e., lower the TERM if LOWER_TERM is True, and truncate it safely to MAX_CHAR_LENGTH UTF-8 characters (meaning, in principle, 4*MAX_CHAR_LENGTH bytes). The function works by an internal conversion of TERM, when needed, from its input Python UTF-8 binary string format into Python Unicode format, and then truncating it safely to the given number of UTF-8 characters, without possible mis-truncation in the middle of a multi-byte UTF-8 character that could otherwise happen if we would have been working with UTF-8 binary representation directly. Note that MAX_CHAR_LENGTH corresponds to the length of the term column in idxINDEX* tables. """ if lower_term: washed_term = unicode(term, 'utf-8').lower() else: washed_term = unicode(term, 'utf-8') if len(washed_term) <= max_char_length: # no need to truncate the term, because it will fit # nicely even if it uses four-byte UTF-8 characters return washed_term.encode('utf-8') else: # truncate the term in a safe position: return washed_term[:max_char_length].encode('utf-8') def wash_author_name(p): """ Wash author name suitable for author searching. 
Notably, replace dots and hyphens with spaces, and collapse spaces. """ if re_pattern_author_canonical_id.search(p): # we have canonical author ID form, so ignore all washing return p out = re_pattern_fuzzy_author_dots.sub(" ", p) out = re_pattern_fuzzy_author_spaces.sub(" ", out) return out.strip() diff --git a/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl b/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl index 903519278..31d66cd72 100644 --- a/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl +++ b/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl @@ -1,56 +1,56 @@ {# # This file is part of Invenio. -# Copyright (C) 2014 CERN. +# Copyright (C) 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. #} {% extends "format/record/Default_HTML_detailed_base.tpl" %} {% block header %} {{ bfe_topbanner(bfo, prefix='
', suffix='

') }} {{ bfe_title(bfo, separator="

") }} {% endblock %} {% block details %} {{ bfe_authors(bfo, suffix="
", limit="25", interactive="yes", print_affiliations="yes", affiliation_prefix=" (", affiliation_suffix=")") }} {{ bfe_addresses(bfo) }} {{ bfe_affiliation(bfo) }} {{ bfe_date(bfo, prefix="
", suffix="
") }} {{ bfe_publisher(bfo, prefix="", suffix="") }} {{ bfe_place(bfo, prefix="", suffix="") }} {{ bfe_isbn(bfo, prefix="
ISBN: ") }} {% endblock %} {% block abstract %} {{ bfe_abstract(bfo, prefix_en="Abstract: ", prefix_fr="Résumé: ", suffix_en="
", suffix_fr="

") }} {{ bfe_keywords(bfo, prefix="
Keyword(s): ", keyword_prefix="", keyword_suffix="") }} {{ bfe_notes(bfo, note_prefix="
Note: ", note_suffix=" ", suffix="
") }} {{ bfe_publi_info(bfo, prefix="

Published in: ") }}
{{ bfe_doi(bfo, prefix="DOI: ", suffix="
") }} {{ bfe_plots(bfo, width="200px", caption="no") }} {% endblock %} {% block footer %} {{ bfe_appears_in_collections(bfo, prefix="

The record appears in these collections:
", suffix="

") }} {# WebTags #} {{ tfn_webtag_record_tags(record.get('recid'), current_user.get_id())|prefix('
') }} {{ tfn_get_back_to_search_links(record.get('recid'))|wrap(prefix='') }} {% endblock %} diff --git a/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py b/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py index 78d5d40ab..9d9e9076c 100644 --- a/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py +++ b/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py @@ -1,369 +1,369 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """bibindex_engine_tokenizer_tests - unit tests for tokenizers There should always be at least one test class for each class in b_e_t. """ from invenio.base.wrappers import lazy_import from invenio.testsuite import make_test_suite, run_test_suite, InvenioTestCase load_tokenizers = lazy_import('invenio.legacy.bibindex.engine_utils:load_tokenizers') _TOKENIZERS = None class TestAuthorTokenizerScanning(InvenioTestCase): """Test BibIndex name tokenization""" def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexAuthorTokenizer"]() self.scan = self.tokenizer.scan_string_for_phrases def test_bifnt_scan_single(self): """BibIndexAuthorTokenizer - scanning single names like 'Dido'""" teststr = "Dido" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Dido'], 'nonlastnames': [], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_simple_western_forward(self): """BibIndexAuthorTokenizer - scanning simple Western-style: first last""" teststr = "Ringo Starr" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_simple_western_reverse(self): """BibIndexAuthorTokenizer - scanning simple Western-style: last, first""" teststr = "Starr, Ringo" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_forward(self): """BibIndexAuthorTokenizer - scanning multiword: first middle last""" teststr = "Michael Edward Peskin" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dotcrammed(self): """BibIndexAuthorTokenizer - scanning multiword: f.m. last""" teststr = "M.E. 
Peskin" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dotcrammed_reversed(self): """BibIndexAuthorTokenizer - scanning multiword: last, f.m.""" teststr = "Peskin, M.E." output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dashcrammed(self): """BibIndexAuthorTokenizer - scanning multiword: first-middle last""" teststr = "Jean-Luc Picard" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dashcrammed_reversed(self): """BibIndexAuthorTokenizer - scanning multiword: last, first-middle""" teststr = "Picard, Jean-Luc" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_dashes(self): """BibIndexAuthorTokenizer - scanning multiword: first middle last-last""" teststr = "Cantina Octavia Jones-Smith" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_dashes_reverse(self): """BibIndexAuthorTokenizer - scanning multiword: last-last, first middle""" teststr = "Jones-Smith, Cantina Octavia" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_reverse(self): """BibIndexAuthorTokenizer - scanning compound last: last last, first""" teststr = "Alvarez Gaume, Joachim" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_titled(self): """BibIndexAuthorTokenizer - scanning title-bearing: last, first, title""" teststr = "Epstein, Brian, The Fifth Beatle" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle'], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_wildly_interesting(self): """BibIndexAuthorTokenizer - scanning last last last, first first, title, title""" teststr = "Ibanez y Gracia, Maria Luisa, II., ed." 
output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II.', 'ed.'], 'raw' : teststr} self.assertEqual(output, anticipated) class TestAuthorTokenizerTokens(InvenioTestCase): """Test BibIndex name variant token generation from scanned and tagged sets""" def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexAuthorTokenizer"]() self.get_index_tokens = self.tokenizer.parse_scanned_for_phrases def test_bifnt_tokenize_single(self): """BibIndexAuthorTokenizer - tokens for single-word name Ronaldo """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ronaldo'], 'nonlastnames': [], 'titles': [], 'raw' : 'Ronaldo'} output = self.get_index_tokens(tagged_data) anticipated = ['Ronaldo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_simple_forward(self): """BibIndexAuthorTokenizer - tokens for first last Ringo Starr """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : 'Ringo Starr'} output = self.get_index_tokens(tagged_data) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_simple_reverse(self): """BibIndexAuthorTokenizer - tokens for last, first Starr, Ringo """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : 'Starr, Ringo'} output = self.get_index_tokens(tagged_data) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_twoname_forward(self): """BibIndexAuthorTokenizer - tokens for first middle last Michael Edward Peskin """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': [], 'raw' : 'Michael Edward Peskin'} output = self.get_index_tokens(tagged_data) anticipated = ['E Peskin', 'Edward Peskin', 'M E Peskin', 'M Edward Peskin', 'M Peskin', 'Michael E Peskin', 'Michael Edward Peskin', 'Michael Peskin', 'Peskin, E', 'Peskin, Edward', 'Peskin, M', 'Peskin, M E', 'Peskin, M Edward', 'Peskin, Michael', 'Peskin, Michael E', 'Peskin, Michael Edward'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_compound_last(self): """BibIndexAuthorTokenizer - tokens for last last, first Alvarez Gaume, Joachim """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': [], 'raw' : 'Alvarez Gaume, Joachim'} output = self.get_index_tokens(tagged_data) anticipated = ['Alvarez Gaume, J', 'Alvarez Gaume, Joachim', 'Alvarez, J', 'Alvarez, Joachim', 'Gaume, J', 'Gaume, Joachim', 'J Alvarez', 'J Alvarez Gaume', 'J Gaume', 'Joachim Alvarez', 'Joachim Alvarez Gaume', 'Joachim Gaume'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_titled(self): """BibIndexAuthorTokenizer - tokens for last, first, title Epstein, Brian, The Fifth Beatle """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle'], 'raw' : 'Epstein, Brian, The Fifth Beatle'} output = self.get_index_tokens(tagged_data) anticipated = ['B 
Epstein', 'B Epstein, The Fifth Beatle', 'Brian Epstein', 'Brian Epstein, The Fifth Beatle', 'Epstein, B', 'Epstein, B, The Fifth Beatle', 'Epstein, Brian', 'Epstein, Brian, The Fifth Beatle'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_wildly_interesting(self): """BibIndexAuthorTokenizer - tokens for last last last, first first, title, title Ibanez y Gracia, Maria Luisa, II, (ed.) """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II', '(ed.)'], 'raw' : 'Ibanez y Gracia, Maria Luisa, II, (ed.)'} output = self.get_index_tokens(tagged_data) anticipated = ['Gracia, L', 'Gracia, Luisa', 'Gracia, M', 'Gracia, M L', 'Gracia, M Luisa', 'Gracia, Maria', 'Gracia, Maria L', 'Gracia, Maria Luisa', 'Ibanez y Gracia, L', 'Ibanez y Gracia, L, II', 'Ibanez y Gracia, Luisa', 'Ibanez y Gracia, Luisa, II', 'Ibanez y Gracia, M', 'Ibanez y Gracia, M L', 'Ibanez y Gracia, M L, II', 'Ibanez y Gracia, M Luisa', 'Ibanez y Gracia, M Luisa, II', 'Ibanez y Gracia, M, II', 'Ibanez y Gracia, Maria', 'Ibanez y Gracia, Maria L', 'Ibanez y Gracia, Maria L, II', 'Ibanez y Gracia, Maria Luisa', 'Ibanez y Gracia, Maria Luisa, II', 'Ibanez y Gracia, Maria, II', 'Ibanez, L', 'Ibanez, Luisa', 'Ibanez, M', 'Ibanez, M L', 'Ibanez, M Luisa', 'Ibanez, Maria', 'Ibanez, Maria L', 'Ibanez, Maria Luisa', 'L Gracia', 'L Ibanez', 'L Ibanez y Gracia', 'L Ibanez y Gracia, II', 'Luisa Gracia', 'Luisa Ibanez', 'Luisa Ibanez y Gracia', 'Luisa Ibanez y Gracia, II', 'M Gracia', 'M Ibanez', 'M Ibanez y Gracia', 'M Ibanez y Gracia, II', 'M L Gracia', 'M L Ibanez', 'M L Ibanez y Gracia', 'M L Ibanez y Gracia, II', 'M Luisa Gracia', 'M Luisa Ibanez', 'M Luisa Ibanez y Gracia', 'M Luisa Ibanez y Gracia, II', 'Maria Gracia', 'Maria Ibanez', 'Maria Ibanez y Gracia', 'Maria Ibanez y Gracia, II', 'Maria L Gracia', 'Maria L Ibanez', 'Maria L Ibanez y Gracia', 'Maria L Ibanez y Gracia, II', 'Maria Luisa Gracia', 'Maria Luisa Ibanez', 'Maria Luisa Ibanez y Gracia', 'Maria Luisa Ibanez y Gracia, II'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_multimiddle_forward(self): """BibIndexAuthorTokenizer - tokens for first middle middle last W K H Panofsky """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Panofsky'], 'nonlastnames': ['W', 'K', 'H'], 'titles': [], 'raw' : 'W K H Panofsky'} output = self.get_index_tokens(tagged_data) anticipated = ['H Panofsky', 'K H Panofsky', 'K Panofsky', 'Panofsky, H', 'Panofsky, K', 'Panofsky, K H', 'Panofsky, W', 'Panofsky, W H', 'Panofsky, W K', 'Panofsky, W K H', 'W H Panofsky', 'W K H Panofsky', 'W K Panofsky', 'W Panofsky'] self.assertEqual(output, anticipated) def test_tokenize(self): """BibIndexAuthorTokenizer - check tokenize_for_phrases() Ringo Starr """ teststr = "Ringo Starr" output = self.tokenizer.tokenize_for_phrases(teststr) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) class TestExactAuthorTokenizer(InvenioTestCase): """Test exact author name tokenizer.""" def setUp(self): """setup""" _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexExactAuthorTokenizer"]() self.tokenize = self.tokenizer.tokenize_for_phrases def test_exact_author_name_tokenizer_bare(self): """BibIndexExactNameTokenizer - bare name""" self.assertEqual(self.tokenize('John Doe'), ['John Doe']) def test_exact_author_name_tokenizer_dots(self): 
"""BibIndexExactNameTokenizer - name with dots""" self.assertEqual(self.tokenize('J. Doe'), ['J Doe']) self.assertEqual(self.tokenize('J.R. Doe'), ['J R Doe']) self.assertEqual(self.tokenize('J. R. Doe'), ['J R Doe']) def test_exact_author_name_tokenizer_trailing_dots(self): """BibIndexExactNameTokenizer - name with trailing dots""" self.assertEqual(self.tokenize('Doe, J'), ['Doe, J']) self.assertEqual(self.tokenize('Doe, J.'), ['Doe, J']) def test_exact_author_name_tokenizer_hyphens(self): """BibIndexExactNameTokenizer - name with hyphens""" self.assertEqual(self.tokenize('Doe, Jean-Pierre'), ['Doe, Jean Pierre']) class TestCJKTokenizer(InvenioTestCase): """Tests for CJK Tokenizer which splits CJK words into characters and treats every single character as a word""" @classmethod def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexCJKTokenizer"]() def test_tokenize_for_words_phrase_galaxy(self): """tokenizing phrase: galaxy s4据信""" phrase = "galaxy s4据信" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['galaxy','s4','据','信']), sorted(result)) def test_tokenize_for_words_phrase_with_special_punctuation(self): """tokenizing phrase: 马英九:台湾民""" phrase = u"马英九:台湾民" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['马','英','九','台','湾','民']), sorted(result)) def test_tokenize_for_words_phrase_with_special_punctuation_two(self): """tokenizing phrase: 色的“刀子嘴”""" phrase = u"色的“刀子嘴”" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['色','的','刀','子','嘴']), sorted(result)) def test_tokenize_for_words_simple_phrase(self): """tokenizing phrase: 春眠暁覚""" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(u'春眠暁覚')), sorted(['春', '眠', '暁', '覚'])) def test_tokenize_for_words_mixed_phrase(self): """tokenizing phrase: 春眠暁ABC覚""" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(u'春眠暁ABC覚')), sorted(['春', '眠', '暁', 'abc', '覚'])) def test_tokenize_for_words_phrase_with_comma(self): """tokenizing phrase: 春眠暁, 暁""" phrase = u"春眠暁, 暁" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(phrase)), sorted(['春','眠','暁'])) TEST_SUITE = make_test_suite(TestAuthorTokenizerScanning, TestAuthorTokenizerTokens, TestExactAuthorTokenizer, TestCJKTokenizer) if __name__ == '__main__': run_test_suite(TEST_SUITE) diff --git a/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py index 0cd14d179..09595476d 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py @@ -1,62 +1,62 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexAuthorCountTokenizer: counts number of authors for any publication given by recID. Will look at tags: '100_a' and '700_a' which are: 'first author name' and 'additional author name'. """ from invenio.legacy.bibindex.engine_utils import get_field_count from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer from invenio.modules.records.api import get_record class BibIndexAuthorCountTokenizer(BibIndexMultiFieldTokenizer): """ Returns a number of authors who created a publication with given recID in the database. Takes recID of the record as an argument to tokenizing function. Calculates terms based on information from multiple tags. For more information on this type of tokenizers take a look on BibIndexAuthorCountTokenizer base class. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.tags = ['100__a', '700__a'] self.nonmarc_tag = 'number_of_authors' def tokenize(self, recID): """Uses get_field_count from bibindex.engine_utils for finding a number of authors of a publication and pass it in the list""" return [str(get_field_count(recID, self.tags)),] def tokenize_via_recjson(self, recID): """ Will tokenize with use of bibfield. @param recID: id of the record """ rec = get_record(recID) return [str(rec.get(self.nonmarc_tag) or 0)] def get_tokenizing_function(self, wordtable_type): return self.tokenize def get_nonmarc_tokenizing_function(self, table_type): return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py index b7011fb67..c990e42aa 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py @@ -1,336 +1,336 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexAuthorTokenizer: tokenizer introduced for author index. It tokenizes author name in a fuzzy way. Creates different variants of an author name. For example: John Cleese will be tokenized into: 'C John', 'Cleese John', 'John, C', 'John, Cleese' """ import re from invenio.config import CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexAuthorTokenizer(BibIndexDefaultTokenizer): """Human name tokenizer. 
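A usage sketch for the BibIndexAuthorCountTokenizer shown above; the record ID and field counts are illustrative, and both paths need a populated record store:

from invenio.modules.indexer.tokenizers.BibIndexAuthorCountTokenizer import \
    BibIndexAuthorCountTokenizer

tokenizer = BibIndexAuthorCountTokenizer()
# MARC path: get_field_count() sums the 100__a and 700__a instances,
# so one first author plus two additional authors yields ['3']
print tokenizer.tokenize(42)
# bibfield path: reads the record's precomputed 'number_of_authors' field
print tokenizer.tokenize_via_recjson(42)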
Human names are divided into three classes of tokens: 'lastnames', i.e., family, tribal or group identifiers, 'nonlastnames', i.e., personal names distinguishing individuals, 'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc' """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) self.single_initial_re = re.compile('^\w\.$') self.split_on_re = re.compile('[\.\s-]') # lastname_stopwords describes terms which should not be used for indexing, # in multiple-word last names. These are purely conjunctions, serving the # same function as the American hyphen, but using linguistic constructs. self.lastname_stopwords = set(['y', 'of', 'and', 'de']) def scan_string_for_phrases(self, s): """Scan a name string and output an object representing its structure. @param s: the input to be lexically tagged @type s: string @return: dict of lexically tagged input items. Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is: { 'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames' : ['Jingleheimer', 'Schmitt'], 'nonlastnames' : ['John', 'Jacob'], 'titles' : ['XVI.'], 'raw' : 'Jingleheimer Schmitt, John Jacob, XVI.' } @rtype: dict """ retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames' : [], 'nonlastnames' : [], 'titles' : [], 'raw' : s} l = s.split(',') if len(l) < 2: # No commas means a simple name new = s.strip() new = s.split(' ') if len(new) == 1: retval['lastnames'] = new # rare single-name case else: retval['lastnames'] = new[-1:] retval['nonlastnames'] = new[:-1] for tag in ['lastnames', 'nonlastnames']: retval[tag] = [x.strip() for x in retval[tag]] retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]] # flatten sublists retval[tag] = [item for sublist in retval[tag] for item in sublist] retval[tag] = [x for x in retval[tag] if x != ''] else: # Handle lastname-first multiple-names case retval['titles'] = l[2:] # no titles? no problem retval['nonlastnames'] = l[1] retval['lastnames'] = l[0] for tag in ['lastnames', 'nonlastnames']: retval[tag] = retval[tag].strip() retval[tag] = re.split(self.split_on_re, retval[tag]) # filter empty strings retval[tag] = [x for x in retval[tag] if x != ''] retval['titles'] = [x.strip() for x in retval['titles'] if x != ''] return retval def parse_scanned_for_phrases(self, scanned): """Return all the indexable variations for a tagged token dictionary. Does this via the combinatoric expansion of the following rules: - Expands first names as name, first initial with period, first initial without period. - Expands compound last names as each of their non-stopword subparts. - Titles are treated literally, but applied serially. Please note that titles will be applied to complete last names only. So for example, if there is a compound last name of the form, "Ibanez y Gracia", with the title, "(ed.)", then only the combination of those two strings will do, not "Ibanez" and not "Gracia". @param scanned: lexically tagged input items in the form of the output from scan() @type scanned: dict @return: combinatorically expanded list of strings for indexing @rtype: list of string """ def _fully_expanded_last_name(first, lastlist, title = None): """Return a list of all of the first / last / title combinations. 
@param first: one possible non-last name @type first: string @param lastlist: the strings of the tokens in the (possibly compound) last name @type lastlist: list of string @param title: one possible title @type title: string """ retval = [] title_word = '' if title != None: title_word = ', ' + title last = ' '.join(lastlist) retval.append(first + ' ' + last + title_word) retval.append(last + ', ' + first + title_word) for last in lastlist: if last in self.lastname_stopwords: continue retval.append(first + ' ' + last + title_word) retval.append(last + ', ' + first + title_word) return retval last_parts = scanned['lastnames'] first_parts = scanned['nonlastnames'] titles = scanned['titles'] raw = scanned['raw'] if len(first_parts) == 0: # rare single-name case return scanned['lastnames'] expanded = [] for exp in self.__expand_nonlastnames(first_parts): expanded.extend(_fully_expanded_last_name(exp, last_parts, None)) for title in titles: # Drop titles which are parenthesized. This eliminates (ed.) from the index, but # leaves XI, for example. This gets rid of the surprising behavior that searching # for 'author:ed' retrieves people who have been editors, but whose names aren't # Ed. # TODO: Make editorship and other special statuses a MARC field. if title.find('(') != -1: continue # XXX: remember to document that titles can only be applied to complete last names expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title)) return sorted(list(set(expanded))) def __expand_nonlastnames(self, namelist): """Generate every expansion of a series of human non-last names. Example: "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward", "M. E.", "M. E", "M E.", "M E", "M.E." ...but never: "ME" @param namelist: a collection of names @type namelist: list of string @return: a greatly expanded collection of names @rtype: list of string """ def _expand_name(name): """Lists [name, initial, empty]""" if name == None: return [] return [name, name[0]] def _pair_items(head, tail): """Lists every combination of head with each and all of tail""" if len(tail) == 0: return [head] l = [] l.extend([head + ' ' + tail[0]]) #l.extend([head + '-' + tail[0]]) l.extend(_pair_items(head, tail[1:])) return l def _collect(head, tail): """Brings together combinations of things""" def _cons(a, l): l2 = l[:] l2.insert(0, a) return l2 if len(tail) == 0: return [head] l = [] l.extend(_pair_items(head, _expand_name(tail[0]))) l.extend([' '.join(_cons(head, tail)).strip()]) #l.extend(['-'.join(_cons(head, tail)).strip()]) l.extend(_collect(head, tail[1:])) return l def _expand_contract(namelist): """Runs collect with every head in namelist and its tail""" val = [] for i in range(len(namelist)): name = namelist[i] for expansion in _expand_name(name): val.extend(_collect(expansion, namelist[i+1:])) return val def _add_squashed(namelist): """Finds cases like 'M. E.' and adds 'M.E.'""" val = namelist def __check_parts(parts): if len(parts) < 2: return False for part in parts: if not self.single_initial_re.match(part): return False return True for name in namelist: parts = name.split(' ') if not __check_parts(parts): continue val.extend([''.join(parts)]) return val return _add_squashed(_expand_contract(namelist)) def tokenize_for_fuzzy_authors(self, phrase): """Output the list of strings expanding phrase. Does this via the combinatoric expansion of the following rules: - Expands first names as name, first initial with period, first initial without period. 
- Expands compound last names as each of their non-stopword subparts. - Titles are treated literally, but applied serially. Please note that titles will be applied to complete last names only. So for example, if there is a compound last name of the form, "Ibanez y Gracia", with the title, "(ed.)", then only the combination of those two strings will do, not "Ibanez" and not "Gracia". Old: BibIndexFuzzyAuthorTokenizer @param phrase: the input to be lexically tagged @type phrase: string @return: combinatorically expanded list of strings for indexing @rtype: list of string @note: A simple wrapper around scan and parse_scanned. """ return self.parse_scanned_for_phrases(self.scan_string_for_phrases(phrase)) def tokenize_for_phrases(self, phrase): """ Another name for tokenize_for_fuzzy_authors. It's for the compatibility. See: tokenize_for_fuzzy_authors """ return self.tokenize_for_fuzzy_authors(phrase) def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexAuthorTokenizer, self).tokenize_for_words(phrase) def get_author_family_name_words_from_phrase(self, phrase): """ Return list of words from author family names, not his/her first names. The phrase is assumed to be the full author name. This is useful for CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES. @param phrase: phrase to get family name from """ d_family_names = {} # first, treat everything before first comma as surname: if ',' in phrase: d_family_names[phrase.split(',', 1)[0]] = 1 # second, try fuzzy author tokenizer to find surname variants: for name in self.tokenize_for_phrases(phrase): if ',' in name: d_family_names[name.split(',', 1)[0]] = 1 # now extract words from these surnames: d_family_names_words = {} for family_name in d_family_names.keys(): for word in self.tokenize_for_words_default(family_name): d_family_names_words[word] = 1 return d_family_names_words.keys() def tokenize_for_words(self, phrase): """ If CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES is 1 we tokenize only for family names. In other case we perform standard tokenization for words. """ if CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES: return self.get_author_family_name_words_from_phrase(phrase) else: return self.tokenize_for_words_default(phrase) diff --git a/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py index c3908cdff..79c3a8c37 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py @@ -1,133 +1,133 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""BibIndexCJKTokenizer: makes search in collections with CJK papers and publications more reliable If phrase has characters from CJK language set tokenizer will treat it diffrently than phrase without these chars. CJK Tokenizer splits CJK words into single characters (it adds space between every two CJK characters). """ import re from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer is_character_from_CJK_set = re.compile(u'[\u3400-\u4DBF\u4E00-\u9FFF]') special_CJK_punctuation = re.compile(u'[\uff1a,\uff0c,\u3001,\u3002,\u201c,\u201d]') def is_from_CJK_set_single_character_match(char): if not isinstance(char, unicode): char = char.decode("utf8") res = is_character_from_CJK_set.match(char) if res: return True return False def is_from_CJK_set_full_match(text): if not isinstance(text, unicode): text = text.decode("utf8") res = is_character_from_CJK_set.findall(text) if len(res) == len(text): return True return False def is_there_any_CJK_character_in_text(text): if not isinstance(text, unicode): text = text.decode("utf8") res = is_character_from_CJK_set.search(text) if res is not None: return True return False def is_non_CJK_expression(word): return not is_there_any_CJK_character_in_text(word) class BibIndexCJKTokenizer(BibIndexDefaultTokenizer): """A phrase is split into CJK characters. CJK is Chinese, Japanese and Korean unified character set. It means that for example, phrase: '据信,新手机更轻' will be split into: ['据', '信', '新', '手', '机', '更', '轻']""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """Initialisation""" BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexCJKTokenizer, self).tokenize_for_words(phrase) def tokenize_for_words(self, phrase): """ Splits phrase into words with additional spaces between CJK characters to enhance search for CJK papers and stuff. If there is no single CJK character in whole phrase it behaves the standard way: it splits phrase into words with use of BibIndexDefaultTokenizer's tokenize_for_words. 
@param phrase: CJK phrase to be tokenized @type phrase: string @return: list of CJK characters and non-CJK words @rtype: list of string """ if is_there_any_CJK_character_in_text(phrase): #remove special CJK punctuation phrase = special_CJK_punctuation.sub("", phrase) #first, we split our phrase with default word tokenizer to make it easier later pre_tokenized = self.tokenize_for_words_default(phrase) #list for keeping CJK chars and non-CJK words chars = [] #every CJK word splits into a set of single characters #for example: "春眠暁覚" into ['春','眠','暁','覚'] words = [ word.decode("utf8") for word in pre_tokenized] for word in words: if is_from_CJK_set_full_match(word): chars.extend(word) else: non_chinese = u"" for char in word: if is_from_CJK_set_single_character_match(char): if non_chinese: chars.append(non_chinese) non_chinese = u"" chars.append(char) else: non_chinese = non_chinese + char if non_chinese: chars.append(non_chinese) clean_dict = {} for c in chars: clean_dict[c] = 1 chars = [c.encode("utf8") for c in clean_dict.keys()] return chars else: return self.tokenize_for_words_default(phrase) def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py index 3344d9b48..b939e69ec 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py @@ -1,53 +1,53 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2013 CERN. +# Copyright (C) 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. from invenio.modules.indexer.tokenizers.BibIndexFilteringTokenizer import BibIndexFilteringTokenizer class BibIndexDOITokenizer(BibIndexFilteringTokenizer): """ Filtering tokenizer which tokenizes DOI tag (0247_a) only if "0247_2" tag is present and its value equals "DOI" and 909C4a tag without any constraints. """ def __init__(self, stemming_language=None, remove_stopwords=False, remove_html_markup=False, remove_latex_markup=False): self.rules = (('0247_a', '2', 'DOI'), ('909C4a', '', '')) def get_tokenizing_function(self, wordtable_type): """Returns proper tokenizing function""" return self.tokenize def tokenize_via_recjson(self, recID): """ Nonmarc version of tokenize function for DOI. Note: with nonmarc we don't need to filter anymore. We just need to take value from record because we use bibfield here. """ rec = get_record(recID) values = rec.get('doi', []) return values def get_nonmarc_tokenizing_function(self, table_type): """ Returns proper tokenizing function for non-marc records. 
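Editor's note: the tokenize_for_words() implementation above splits any phrase containing CJK characters into single CJK characters while keeping non-CJK runs intact. The following standalone sketch reuses the same Unicode ranges but omits the special-punctuation stripping and duplicate removal of the real method.

# -*- coding: utf-8 -*-
# Illustrative sketch of the CJK splitting rule used by
# BibIndexCJKTokenizer.tokenize_for_words(): every CJK character becomes a
# token of its own, while runs of non-CJK characters stay together.
import re

CJK_CHAR = re.compile(u'[\u3400-\u4DBF\u4E00-\u9FFF]')  # same ranges as above

def split_cjk(phrase):
    """Split unicode PHRASE into single CJK characters and non-CJK words."""
    tokens = []
    for word in phrase.split():          # whitespace split first, as the
        buf = u''                        # default word tokenizer would do
        for char in word:
            if CJK_CHAR.match(char):
                if buf:                  # flush any pending non-CJK run
                    tokens.append(buf)
                    buf = u''
                tokens.append(char)      # one token per CJK character
            else:
                buf += char
        if buf:
            tokens.append(buf)
    return tokens

if __name__ == '__main__':
    for token in split_cjk(u'春眠暁覚 morning sleep'):
        print(token)                     # 春 眠 暁 覚 morning sleep, one per line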
""" return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py index cc8bdae84..530f0b93e 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py @@ -1,173 +1,173 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexDefaultTokenizer: useful for almost all indexes. It performs standard tokenization. It splits phrases into words/pairs or doesnt split at all, strips accents, removes alphanumeric characters and html and latex markup if we want to. Also can stem words for a given language. """ from invenio.legacy.bibindex.engine_config import \ CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.utils.html import remove_html_markup from invenio.utils.text import wash_for_utf8, strip_accents from invenio.legacy.bibindex.engine_washer import \ lower_index_term, remove_latex_markup, \ apply_stemming, remove_stopwords, length_check from invenio.legacy.bibindex.engine_utils import latex_formula_re, \ re_block_punctuation_begin, \ re_block_punctuation_end, \ re_punctuation, \ re_separators, \ re_arxiv from invenio.modules.indexer.tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer class BibIndexDefaultTokenizer(BibIndexStringTokenizer): """ It's a standard tokenizer. It is useful for most of the indexes. Its behaviour depends on stemming, remove stopwords, remove html markup and remove latex markup parameters. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """initialization""" self.stemming_language = stemming_language self.remove_stopwords = remove_stopwords self.remove_html_markup = remove_html_markup self.remove_latex_markup = remove_latex_markup def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" if wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]: return self.tokenize_for_words elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]: return self.tokenize_for_pairs elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]: return self.tokenize_for_phrases def tokenize_for_words(self, phrase): """Return list of words found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. 
""" words = {} formulas = [] if self.remove_html_markup and phrase.find(" -1: phrase = remove_html_markup(phrase) if self.remove_latex_markup: formulas = latex_formula_re.findall(phrase) phrase = remove_latex_markup(phrase) phrase = latex_formula_re.sub(' ', phrase) phrase = wash_for_utf8(phrase) phrase = lower_index_term(phrase) # 1st split phrase into blocks according to whitespace for block in strip_accents(phrase).split(): # 2nd remove leading/trailing punctuation and add block: block = re_block_punctuation_begin.sub("", block) block = re_block_punctuation_end.sub("", block) if block: stemmed_block = remove_stopwords(block, self.remove_stopwords) stemmed_block = length_check(stemmed_block) stemmed_block = apply_stemming(stemmed_block, self.stemming_language) if stemmed_block: words[stemmed_block] = 1 if re_arxiv.match(block): # special case for blocks like `arXiv:1007.5048' where # we would like to index the part after the colon # regardless of dot or other punctuation characters: words[block.split(':', 1)[1]] = 1 # 3rd break each block into subblocks according to punctuation and add subblocks: for subblock in re_punctuation.split(block): stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords) stemmed_subblock = length_check(stemmed_subblock) stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language) if stemmed_subblock: words[stemmed_subblock] = 1 # 4th break each subblock into alphanumeric groups and add groups: for alphanumeric_group in re_separators.split(subblock): stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords) stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group) stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language) if stemmed_alphanumeric_group: words[stemmed_alphanumeric_group] = 1 for block in formulas: words[block] = 1 return words.keys() def tokenize_for_pairs(self, phrase): """Return list of words found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. 
""" words = {} if self.remove_html_markup and phrase.find(" -1: phrase = remove_html_markup(phrase) if self.remove_latex_markup: phrase = remove_latex_markup(phrase) phrase = latex_formula_re.sub(' ', phrase) phrase = wash_for_utf8(phrase) phrase = lower_index_term(phrase) # 1st split phrase into blocks according to whitespace last_word = '' for block in strip_accents(phrase).split(): # 2nd remove leading/trailing punctuation and add block: block = re_block_punctuation_begin.sub("", block) block = re_block_punctuation_end.sub("", block) if block: block = remove_stopwords(block, self.remove_stopwords) block = length_check(block) block = apply_stemming(block, self.stemming_language) # 3rd break each block into subblocks according to punctuation and add subblocks: for subblock in re_punctuation.split(block): subblock = remove_stopwords(subblock, self.remove_stopwords) subblock = length_check(subblock) subblock = apply_stemming(subblock, self.stemming_language) if subblock: # 4th break each subblock into alphanumeric groups and add groups: for alphanumeric_group in re_separators.split(subblock): alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords) alphanumeric_group = length_check(alphanumeric_group) alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language) if alphanumeric_group: if last_word: words['%s %s' % (last_word, alphanumeric_group)] = 1 last_word = alphanumeric_group return words.keys() def tokenize_for_phrases(self, phrase): """Return list of phrases found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. """ phrase = wash_for_utf8(phrase) return [phrase] def get_nonmarc_tokenizing_function(self, table_type): """ Picks correct tokenize_for_xxx function depending on the type of tokenization for non-marc standards. """ return self.get_tokenizing_function(table_type) diff --git a/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py index 4e87d3c50..1508a6bab 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py @@ -1,73 +1,73 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexEmptyTokenizer. It's a really lazy tokenizer and doesn't do anything. """ from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.modules.indexer.tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer class BibIndexEmptyTokenizer(BibIndexStringTokenizer): """ BibIndexEmptyTokenizer doesn't do anything. 
Irrespective of input to tokenizing function it always returns empty list. Can be used in some default cases or when we want to turn off specific index. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" if wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]: return self.tokenize_for_words elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]: return self.tokenize_for_pairs elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]: return self.tokenize_for_phrases def get_nonmarc_tokenizing_function(self, table_type): """ Picks correct tokenize_for_xxx function depending on the type of tokenization for non-marc standards. """ return self.get_tokenizing_function(table_type) def tokenize_for_words(self, phrase): return [] def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py index e111b5292..2c7f0a851 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py @@ -1,44 +1,44 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexExactAuthorTokenizer: performs only washing on author name and leaves it alone in the same form. """ from invenio.legacy.bibindex.engine_washer import wash_author_name from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexExactAuthorTokenizer(BibIndexDefaultTokenizer): """ Human name exact tokenizer. Old: BibIndexExactNameTokenizer """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def tokenize_for_phrases(self, s): """ Returns washed autor name. 
""" return [wash_author_name(s)] diff --git a/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py index 313a95a87..9aa52d3c4 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py @@ -1,69 +1,69 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFilenameTokenizer: 'tokenizes' finds file names. Tokenizer is adapted to work with bibfield and its get_record function. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexFilenameTokenizer(BibIndexRecJsonTokenizer): """ Tokenizes for file names. Tokenizer is adapted to work with bibfield and its get_record function. It accepts as an input a record created by a get_record function: from bibfield import get_record record16 = get_record(16) tokenizer = BibIndexFilenameTokenizer() new_words = tokenizer.tokenize(record16) Example of new_words: 'thesis.ps.gz' -> ['thesis', 'thesis.ps', 'thesis.ps.gz'] """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record from bibfield. Function uses derived field 'filenames' from the record. @param urls: recjson record """ values = [] try: if 'filenames' in record: values = record['filenames'] except KeyError: pass except TypeError: return [] return values def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py index c5b17b7ad..388d3ddd8 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py @@ -1,71 +1,71 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFiletypeTokenizer: 'tokenizes' for file extensions. Tokenizer is adapted to work with recjson and its get_record function. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexFiletypeTokenizer(BibIndexRecJsonTokenizer): """ Tokenizes for file extensions. Tokenizer is adapted to work with recjson and its get_record function. It accepts as an input a record created by a get_record function: from invenio.modules.records.api import get_record record16 = get_record(16) tokenizer = BibIndexFiletypeTokenizer() new_words = tokenizer.tokenize(record16) """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record. Function uses derived field 'filetypes' from the record. @param urls: recjson record """ values = [] try: if 'filetypes' in record: values = record['filetypes'] except KeyError: pass except TypeError: return [] return values def tokenize_for_words(self, record): return self.tokenize(record) def tokenize_for_pairs(self, record): return self.tokenize(record) def tokenize_for_phrases(self, record): return self.tokenize(record) def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py index 1e7afebe0..18704c32c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py @@ -1,85 +1,85 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. +# Copyright (C) 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. from invenio.legacy.search_engine import get_record from invenio.legacy.bibrecord import record_get_field_instances from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer class BibIndexFilteringTokenizer(BibIndexMultiFieldTokenizer): """ This tokenizer would tokenize phrases from tag only if another tag was present in the record's metadata, for example it would tokenize phrases from 100__a only if 100__u was found in the record's metadata. This tokenizer is abstract and it shouldn't be used for indexes. Insted of using this tokenizer one can create another tokenizer iheriting after this one. To create new tokenizer based on BibIndexFilteringTokenizer you need to specify rules of tokenizing in self.rules property. 
Examples: 1) Let's say we want to tokenize data only from 100__a if 100__u is present: set: self.rules = (('100__a', 'u', ''),) 2) We want to tokenize data from '0247_a' if '0247_2' == 'DOI': set: self.rules = (('0247_2', '2', 'DOI'),) 3) We want to tokenize data from '0247_a' if '0247_2' == 'DOI' and all data from '100__a' with no constraints: set: self.rules = (('0247_2', '2', 'DOI'), ('100__a', '', '')) Definition of 'rules' tuple: (tag_to_take_phrases_from, value_of_sub_tag or '', necessary_value_of_sub_tag or '') Note: there is no get_tokenizing_function() to make this tokenizer abstract. """ def __init__(self, stemming_language=None, remove_stopwords=False, remove_html_markup=False, remove_latex_markup=False): self.rules = () def tokenize(self, recID): phrases = [] try: rec = get_record(recID) for rule in self.rules: tag_to_index, necessary_tag, necessary_value = rule core_tag = tag_to_index[0:3] ind = tag_to_index[3:5] sub_tag = tag_to_index[5] fields = [dict(instance[0]) for instance in record_get_field_instances(rec, core_tag, ind[0], ind[1])] for field in fields: tag_condition = necessary_tag and field.has_key(necessary_tag) or necessary_tag == '' value_condition = necessary_value and field.get(necessary_tag, '') == necessary_value or \ necessary_value == '' if tag_condition and field.has_key(sub_tag) and value_condition: phrases.append(field[sub_tag]) return phrases except KeyError: return [] return phrases def tokenize_via_recjson(self, recID): """ TODO: implementation needs to be introduced in order to work with non-marc standards. """ return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py index 2159e3163..86ff9526d 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py @@ -1,194 +1,194 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFulltextTokenizer: extracts words form a given document. Document is given by its URL. 
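Editor's note: the rules tuples described above drive BibIndexFilteringTokenizer.tokenize(). The sketch below applies the same (tag_to_index, necessary_subfield, necessary_value) logic to field instances represented as plain dicts rather than bibrecord structures; the record layout and helper name are illustrative, and the rule shown mirrors the DOI rule used by BibIndexDOITokenizer earlier in this patch.

# Illustrative sketch of the self.rules filtering logic described above,
# applied to field instances given as dicts of subfield -> value.

def filter_phrases(field_instances, rules):
    """Collect subfield values according to RULES.

    Each rule is (tag_to_index, necessary_subfield, necessary_value); an empty
    necessary_subfield/value means "no constraint".
    """
    phrases = []
    for tag_to_index, necessary_tag, necessary_value in rules:
        tag, sub_tag = tag_to_index[:5], tag_to_index[5]
        for field in field_instances.get(tag, []):
            tag_ok = necessary_tag == '' or necessary_tag in field
            value_ok = necessary_value == '' or field.get(necessary_tag) == necessary_value
            if tag_ok and value_ok and sub_tag in field:
                phrases.append(field[sub_tag])
    return phrases

if __name__ == '__main__':
    record_fields = {
        '0247_': [{'2': 'DOI', 'a': '10.1234/example.doi'},      # made-up values
                  {'2': 'ISBN', 'a': '978-0-00-000000-0'}],
    }
    # take 0247_a only when 0247_2 == 'DOI', as in the DOI tokenizer:
    print(filter_phrases(record_fields, (('0247_a', '2', 'DOI'),)))
    # ['10.1234/example.doi']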
""" import os import sys import logging import urllib2 import re from six import iteritems from invenio.config import \ CFG_SOLR_URL, \ CFG_XAPIAN_ENABLED, \ CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY, \ CFG_BIBINDEX_SPLASH_PAGES from invenio.utils.html import get_links_in_html_page from invenio.legacy.websubmit.file_converter import convert_file, get_file_converter_logger from invenio.legacy.miscutil.solrutils_bibindex_indexer import solr_add_fulltext from invenio.legacy.miscutil.xapianutils_bibindex_indexer import xapian_add from invenio.legacy.bibdocfile.api import bibdocfile_url_p, \ bibdocfile_url_to_bibdoc, download_url, \ BibRecDocs, InvenioBibDocFileError from invenio.legacy.bibindex.engine_utils import get_idx_indexer from invenio.legacy.bibsched.bibtask import write_message from invenio.ext.logging import register_exception from intbitset import intbitset from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer fulltext_added = intbitset() # stores ids of records whose fulltexts have been added class BibIndexFulltextTokenizer(BibIndexDefaultTokenizer): """ Exctracts all the words contained in document specified by url. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.verbose = 3 BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def set_verbose(self, verbose): """Allows to change verbosity level during indexing""" self.verbose = verbose def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexFulltextTokenizer, self).tokenize_for_words(phrase) def get_words_from_fulltext(self, url_direct_or_indirect): """Returns all the words contained in the document specified by URL_DIRECT_OR_INDIRECT with the words being split by various SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF file. (This is interesting to index Indico for example.) Note also that URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file or an URL to a setlink-like page body that presents the links to be indexed. In the latter case the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs to fulltext documents, for all knows file extensions as specified by global CONV_PROGRAMS config variable. """ write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2) try: if bibdocfile_url_p(url_direct_or_indirect): write_message("... 
%s is an internal document" % url_direct_or_indirect, verbose=2) try: bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect) except InvenioBibDocFileError: # Outdated 8564 tag return [] indexer = get_idx_indexer('fulltext') if indexer != 'native': # A document might belong to multiple records for rec_link in bibdoc.bibrec_links: recid = rec_link["recid"] # Adds fulltexts of all files once per records if not recid in fulltext_added: bibrecdocs = BibRecDocs(recid) try: text = bibrecdocs.get_text() except InvenioBibDocFileError: # Invalid PDF continue if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext(recid, text) elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: xapian_add(recid, 'fulltext', text) fulltext_added.add(recid) # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: return [] else: text = "" if hasattr(bibdoc, "get_text"): text = bibdoc.get_text() return self.tokenize_for_words_default(text) else: if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY: write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2) return [] write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2) urls_to_index = set() for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES): if re.match(splash_re, url_direct_or_indirect): write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2) html = urllib2.urlopen(url_direct_or_indirect).read() urls = get_links_in_html_page(html) write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3) for url in urls: if re.match(url_re, url): write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2) urls_to_index.add(url) if not urls_to_index: urls_to_index.add(url_direct_or_indirect) write_message("... 
will extract words from %s" % ', '.join(urls_to_index), verbose=2) words = {} for url in urls_to_index: tmpdoc = download_url(url) file_converter_logger = get_file_converter_logger() old_logging_level = file_converter_logger.getEffectiveLevel() if self.verbose > 3: file_converter_logger.setLevel(logging.DEBUG) try: try: tmptext = convert_file(tmpdoc, output_format='.txt') text = open(tmptext).read() os.remove(tmptext) indexer = get_idx_indexer('fulltext') if indexer != 'native': if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext(None, text) # FIXME: use real record ID if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: #xapian_add(None, 'fulltext', text) # FIXME: use real record ID pass # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: tmpwords = [] else: tmpwords = self.tokenize_for_words_default(text) words.update(dict(map(lambda x: (x, 1), tmpwords))) except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) finally: os.remove(tmpdoc) if self.verbose > 3: file_converter_logger.setLevel(old_logging_level) return words.keys() except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) return [] def tokenize_for_words(self, phrase): return self.get_words_from_fulltext(phrase) def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py index 9420450fe..61db49611 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py @@ -1,56 +1,56 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexItemCountTokenizer: counts the number of copies of a book which is owned by the library in the real world. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexItemCountTokenizer(BibIndexRecJsonTokenizer): """ Returns a number of copies of a book which is owned by the library. 
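Editor's note on the external-URL branch of get_words_from_fulltext() further above: CFG_BIBINDEX_SPLASH_PAGES maps a splash-page URL pattern to a pattern for the fulltext links that should be followed from that page. A standalone sketch of that URL selection; the patterns and link list are made-up examples, not the real configuration.

# Illustrative sketch of the splash-page handling in
# BibIndexFulltextTokenizer.get_words_from_fulltext(): links found on a splash
# page are indexed only if they match the associated fulltext-URL pattern.
import re

SPLASH_PAGES = {r'^http://example\.org/record/\d+$':      # splash-page pattern
                r'^http://example\.org/files/.*\.pdf$'}   # fulltext-URL pattern

def urls_to_index(url, links_on_page):
    selected = set()
    for splash_re, url_re in SPLASH_PAGES.items():
        if re.match(splash_re, url):
            for link in links_on_page:
                if re.match(url_re, link):
                    selected.add(link)
    if not selected:
        selected.add(url)   # not a splash page: index the URL itself
    return sorted(selected)

if __name__ == '__main__':
    print(urls_to_index('http://example.org/record/42',
                        ['http://example.org/files/42.pdf',
                         'http://example.org/css/style.css']))
    # ['http://example.org/files/42.pdf']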
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """Tokenizes for number of copies of a book in the 'real' library""" count = 0 try: count = record['number_of_copies'] except KeyError: pass except TypeError: return [] return [str(count)] def tokenize_for_words(self, record): return self.tokenize(record) def tokenize_for_pairs(self, record): return self.tokenize(record) def tokenize_for_phrases(self, record): return self.tokenize(record) def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py index b6e1b1059..a61d4546c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py @@ -1,141 +1,141 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexJournalTokenizer: useful for journal index. Agregates info about journal in a specific way given by its variable journal_pubinfo_standard_form. Behaves in the same way for all index table types: - Words - Pairs - Phrases """ from invenio.legacy.dbquery import run_sql from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer from invenio.config import \ CFG_CERN_SITE, \ CFG_INSPIRE_SITE from invenio.legacy.bibindex.engine_utils import get_values_recursively from invenio.modules.records.api import get_record if CFG_CERN_SITE: CFG_JOURNAL_TAG = '773__%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "773__p 773__v (773__y) 773__c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*\s\w.*\s\(\d+\)\s\w.*$' elif CFG_INSPIRE_SITE: CFG_JOURNAL_TAG = '773__%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "773__p,773__v,773__c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*,\w.*,\w.*$' else: CFG_JOURNAL_TAG = '909C4%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "909C4p 909C4v (909C4y) 909C4c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*\s\w.*\s\(\d+\)\s\w.*$' class BibIndexJournalTokenizer(BibIndexMultiFieldTokenizer): """ Tokenizer for journal index. Returns joined title/volume/year/page as a word from journal tag. Tokenizer works on multiple tags. For more information on tokenizers working on per-record basis take a look on BibIndexJournalTokenizer base class. 
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.tag = CFG_JOURNAL_TAG self.nonmarc_tag = 'journal_info' self.journal_pubinfo_standard_form = CFG_JOURNAL_PUBINFO_STANDARD_FORM self.journal_pubinfo_standard_form_regexp_check = CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK def tokenize(self, recID): """ Special procedure to extract words from journal tags. Joins title/volume/year/page into a standard form that is also used for citations. """ # get all journal tags/subfields: bibXXx = "bib" + self.tag[0] + self.tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.field_number,b.tag,b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, self.tag)) # construct journal pubinfo: dpubinfos = {} for row in res: nb_instance, subfield, value = row if subfield.endswith("c"): # delete pageend if value is pagestart-pageend # FIXME: pages may not be in 'c' subfield value = value.split('-', 1)[0] if nb_instance in dpubinfos: dpubinfos[nb_instance][subfield] = value else: dpubinfos[nb_instance] = {subfield: value} # construct standard format: lwords = [] for dpubinfo in dpubinfos.values(): # index all journal subfields separately for tag, val in dpubinfo.items(): lwords.append(val) # index journal standard format: pubinfo = self.journal_pubinfo_standard_form for tag, val in dpubinfo.items(): pubinfo = pubinfo.replace(tag, val) if self.tag[:-1] in pubinfo: # some subfield was missing, do nothing pass else: lwords.append(pubinfo) # return list of words and pubinfos: return lwords def tokenize_via_recjson(self, recID): """ Tokenizes for journal info. Uses bibfield. """ phrases = [] rec = get_record(recID) recjson_field = rec.get(self.nonmarc_tag) get_values_recursively(recjson_field, phrases) final = [] append = final.append for phrase in phrases: info = phrase.split("-", 1) append(info[0]) return final def tokenize_for_words(self, recID): return self.tokenize(recID) def tokenize_for_pairs(self, recID): return self.tokenize(recID) def tokenize_for_phrases(self, recID): return self.tokenize(recID) def get_tokenizing_function(self, wordtable_type): return self.tokenize def get_nonmarc_tokenizing_function(self, table_type): return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py index 2a59e9313..5a73428f6 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py @@ -1,81 +1,81 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexMultiFieldTokenizer. Base class for tokenizers that work on more than one field and possibly on more than one phrase at a time. """ from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexMultiFieldTokenizer(BibIndexTokenizer): """ BibIndexMultiFieldTokenizer is an abstract tokenizer. It should be used only for inheritance. This tokenizer should be a base class for more complicated tokenizers which tokenizing functions perform calculations on per record basis and NOT per string basis (look for BibIndexDefaultTokenizer if you want to know more about the latter type of tokenization). Tokenizing functions take as an argument recID of the record we want to perform calculations on. Example: class BibIndexComplicatedTokenizer(BibIndexMultiFieldTokenizer): (...) recID = 10 a = BibIndexComplicatedTokenizer() res = a.tokenize_for_words(recID) Good examples of MultiFieldTokenizer are JournalTokenizer and AuthorCountTokenizer. Both return results after processing more than one field/tag of the record (for more information check these tokenizers). """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" raise NotImplementedError def tokenize_for_words(self, recid): raise NotImplementedError def tokenize_for_pairs(self, recid): raise NotImplementedError def tokenize_for_phrases(self, recid): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py index 6e840199c..160137e8e 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py @@ -1,70 +1,70 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2014 CERN. +# Copyright (C) 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexRecJsonTokenizer. It's an abstract class created only for inheritance purposes. Tokenizers which are based on RecJsonTokenizer use bibfield/JsonAlchemy records. Logic of the tokenization process is in the functions of bibfield module. Tokenizer itself should perform only necessary post-processing. 
""" from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexRecJsonTokenizer(BibIndexTokenizer): """ BibIndexRecJsonTokenizer is an abstract tokenizer. It should be used only for inheritance. It should be a base class for all tokenizers which need to use bibfield/JsonAlchemy records. Tokenizing function of RecJsonTokenizer takes a bibfield record as an argument. Main logic of tokenization process stays in bibfield record's functions. Tokenizing functions of all tokenizers inheriting after RecJsonTokenizer should only do post-processing tasks. For example of use please check: BibIndexFiletypeTokenizer """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record from bibfield module @param urls: recjson record """ raise NotImplementedError def get_tokenizing_function(self, wordtable_type): raise NotImplementedError def tokenize_for_words(self, recid): raise NotImplementedError def tokenize_for_pairs(self, recid): raise NotImplementedError def tokenize_for_phrases(self, recid): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py index 69f54c34b..05c1dad3c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py @@ -1,65 +1,65 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Abstract BibIndexStringTokenizer. It is a tokenizer created only for inheritance. All string based tokenizers should inherit after this tokenizer. """ from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexStringTokenizer(BibIndexTokenizer): """ BibIndexStringTokenizer is an abstract tokenizer. It should be used only for inheritance. This tokenizer should be a base class for tokenizers which operates on strings/phrases and splits them into multiple terms/tokens. Tokenizing functions take phrase as an argument. Good examples of StringTokenizer is DeafultTokenizer. 
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" raise NotImplementedError def tokenize_for_words(self, phrase): raise NotImplementedError def tokenize_for_pairs(self, phrase): raise NotImplementedError def tokenize_for_phrases(self, phrase): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py index 3cb71e63f..bc06ea4ae 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py @@ -1,176 +1,176 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexTokenizer: generic, not implemented tokenizer for inheritance Inheritance tree for tokenizers in Invenio: BibIndexTokenizer ^ | |----BibIndexStringTokenizer<---| | | | BibIndexDefaultTokenizer<---| | | | BibIndexAuthorTokenizer | BibIndexExactAuthorTokenizer | (...) | |----BibIndexRecJsonTokenizer<---| | | | BibIndexFiletypeTokenizer | (...) | |----BibIndexMultiFieldTokenizer<---| | BibIndexJournalTokenizer BibIndexAuthorCountTokenizer (...) """ class BibIndexTokenizer(object): """ Base class for the tokenizers. Tokenizers are components that find terms which need to be indexed and stored in DB. Different types of tokenizers work in different ways. Tokenizers are divided into three groups: - tokenizers that take string as an input and split it into tokens/terms which later are indexed - tokenizers that take recID of the record and find terms by processing many fields/tags from the record - tokenizers that use bibfield module and their functions which precomputes terms to index """ #words part def scan_string_for_words(self, s): """Return an intermediate representation of the tokens in s. Every tokenizer should have a scan_string function, which scans the input string and lexically tags its components. These units are grouped together sequentially. 
The output of scan_string is usually something like: { 'TOKEN_TAG_LIST' : a list of valid keys in this output set, 'key1' : [val1, val2, val3] - where key describes the in some meaningful way } @param s: the input to be lexically tagged @type s: string @return: dict of lexically tagged input items In a sample Tokenizer where scan_string simply splits s on space, scan_string might output the following for "Assam and Darjeeling": { 'TOKEN_TAG_LIST' : 'word_list', 'word_list' : ['Assam', 'and', 'Darjeeling'] } @rtype: dict """ raise NotImplementedError def parse_scanned_for_words(self, o): """Calculate the token list from the intermediate representation o. While this should be an interesting computation over the intermediate representation generated by scan_string, obviously in the split-on- space example we need only return o['word_list']. @param t: a dictionary with a 'word_list' key @type t: dict @return: the token items from 'word_list' @rtype: list of string """ raise NotImplementedError def tokenize_for_words(self, s): """Main entry point. Return token list from input string s. Simply composes the functionality above. @param s: the input to be lexically tagged @type s: string @return: the token items derived from s @rtype: list of string """ raise NotImplementedError #pairs part def scan_string_for_pairs(self, s): """ See: scan_string_for_words """ raise NotImplementedError def parse_scanned_for_pairs(self, o): """ See: parse_scanned_for_words """ raise NotImplementedError def tokenize_for_pairs(self, s): """ See: tokenize_for_words """ raise NotImplementedError #phrases part def scan_string_for_phrases(self, s): """ See: scan_string_for_words """ raise NotImplementedError def parse_scanned_for_phrases(self, o): """ See: parse_scanned_for_words """ raise NotImplementedError def tokenize_for_phrases(self, s): """ See: tokenize_for_words """ raise NotImplementedError def get_tokenizing_function(self, wordtable_type): """Chooses tokenize_for_words, tokenize_for_phrases or tokenize_for_pairs depending on type of tokenization we want to perform.""" raise NotImplementedError def get_nonmarc_tokenizing_function(self, table_type): """Chooses best tokenizing function depending on type of tokenization we want to perform. Non-marc version. """ raise NotImplementedError @property def implemented(self): try: self.get_tokenizing_function("") except NotImplementedError: return False except AttributeError: return False return True @property def implemented_nonmarc(self): try: self.get_nonmarc_tokenizing_function("") except NotImplementedError: return False except AttributeError: return False - return True \ No newline at end of file + return True diff --git a/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py index 38cf6fb75..8d3b5b20c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py @@ -1,71 +1,71 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. 
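Editor's note: the base-class docstrings above describe the scan/parse/tokenize contract with a split-on-space example. The following minimal standalone tokenizer honours that contract; it is purely illustrative and does not inherit from the Invenio classes.

# Illustrative, standalone implementation of the scan/parse/tokenize contract
# documented in BibIndexTokenizer, using the split-on-space example from the
# docstring above.

class SplitOnSpaceTokenizer(object):

    def scan_string_for_words(self, s):
        # lexically tag the input: here the only unit is a word list
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, scanned):
        # derive the final token list from the intermediate representation
        return scanned['word_list']

    def tokenize_for_words(self, s):
        # main entry point: compose the two steps above
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

if __name__ == '__main__':
    print(SplitOnSpaceTokenizer().tokenize_for_words('Assam and Darjeeling'))
    # ['Assam', 'and', 'Darjeeling']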
# # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexYearTokenizer: useful for year index. Extracts words (year) from date tags. """ from invenio.config import \ CFG_INSPIRE_SITE from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexYearTokenizer(BibIndexDefaultTokenizer): """ Year tokenizer. It tokenizes words from date tags or uses default word tokenizer. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def get_words_from_date_tag(self, datestring): """ Special procedure to index words from tags storing date-like information in format YYYY or YYYY-MM or YYYY-MM-DD. Namely, we are indexing word-terms YYYY, YYYY-MM, YYYY-MM-DD, but never standalone MM or DD. """ out = [] for dateword in datestring.split(): # maybe there are whitespaces, so break these too out.append(dateword) parts = dateword.split('-') for nb in range(1, len(parts)): out.append("-".join(parts[:nb])) return out def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexYearTokenizer, self).tokenize_for_words(phrase) def tokenize_for_words(self, phrase): """ If CFG_INSPIRE_SITE is 1 we perform special tokenization which relies on getting words form date tag. In other case we perform default tokenization. """ if CFG_INSPIRE_SITE: return self.get_words_from_date_tag(phrase) else: return self.tokenize_for_words_default(phrase) diff --git a/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py b/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py index cd07cd9f4..36383b1ae 100644 --- a/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py +++ b/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py @@ -1,27 +1,27 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
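Editor's note: get_words_from_date_tag() above indexes a date together with every leading prefix (YYYY, YYYY-MM), but never a bare month or day. A compact standalone version with a usage example:

# Illustrative, standalone version of the date expansion performed by
# BibIndexYearTokenizer.get_words_from_date_tag().

def date_words(datestring):
    out = []
    for dateword in datestring.split():        # tolerate extra whitespace
        out.append(dateword)                   # full date first
        parts = dateword.split('-')
        for nb in range(1, len(parts)):
            out.append('-'.join(parts[:nb]))   # then each leading prefix
    return out

if __name__ == '__main__':
    print(date_words('2015-03-17'))
    # ['2015-03-17', '2015', '2015-03']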
def get_pubinfo_standard_form(pubinfo): """ """ if all(key in pubinfo for key in ('c', 'p', 'v', 'y', )): return '%s %s (%s) %s' % (pubinfo['p'], pubinfo['v'], pubinfo['y'], pubinfo['c'].split("-", 1)[0], ) else: - return None \ No newline at end of file + return None diff --git a/invenio/testsuite/test_legacy_dbquery.py b/invenio/testsuite/test_legacy_dbquery.py index 979c4535a..1625bd75d 100644 --- a/invenio/testsuite/test_legacy_dbquery.py +++ b/invenio/testsuite/test_legacy_dbquery.py @@ -1,111 +1,110 @@ - # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2013 CERN. +# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Unit tests for dbquery library.""" __revision__ = "$Id$" from invenio.base.wrappers import lazy_import from invenio.testsuite import make_test_suite, run_test_suite, InvenioTestCase dbquery = lazy_import('invenio.legacy.dbquery') class TableUpdateTimesTest(InvenioTestCase): """Test functions related to the update_times of MySQL tables.""" def _check_table_update_time(self, tablename): """Helper function to check update time of TABLENAME.""" from invenio.base.globals import cfg # detect MySQL version number: res = dbquery.run_sql("SELECT VERSION()") mysql_server_version = res[0][0] if mysql_server_version.startswith("5."): # MySQL-5 provides INFORMATION_SCHEMA: query = """SELECT UPDATE_TIME FROM INFORMATION_SCHEMA.TABLES WHERE table_name='%s' AND table_schema='%s'""" \ % (tablename, cfg['CFG_DATABASE_NAME']) tablename_update_time = str(dbquery.run_sql(query)[0][0]) elif mysql_server_version.startswith("4.1"): # MySQL-4.1 has it on 12th position: query = """SHOW TABLE STATUS LIKE '%s'""" % tablename tablename_update_time = str(dbquery.run_sql(query)[0][12]) elif mysql_server_version.startswith("4.0"): # MySQL-4.0 has it on 11th position: query = """SHOW TABLE STATUS LIKE '%s'""" % tablename tablename_update_time = str(dbquery.run_sql(query)[0][11]) else: tablename_update_time = "MYSQL SERVER VERSION NOT DETECTED" # compare it with the one detected by the function: self.assertEqual(tablename_update_time, dbquery.get_table_update_time(tablename)) def test_single_table_update_time(self): """dbquery - single table (with indexes) update time detection""" # NOTE: this tests usual "long" branch of # get_table_update_time() self._check_table_update_time("collection") def test_empty_table_update_time(self): """dbquery - empty table (no indexes) update time detection""" # NOTE: this tests unusual "None" branch of # get_table_update_time() # create empty test table test_table = "tmpTESTTABLE123" dbquery.run_sql("CREATE TABLE IF NOT EXISTS %s (a INT)" % test_table) # run the test: self._check_table_update_time(test_table) # drop empty test table dbquery.run_sql("DROP TABLE %s" % test_table) def test_utf8_python_mysqldb_mysql_storage_chain(self): 
"""dbquery - UTF-8 in Python<->MySQLdb<->MySQL storage chain""" # NOTE: This test test creates, uses and destroys a temporary # table called "test__invenio__utf8". beta_in_utf8 = "β" # Greek beta in UTF-8 is 0xCEB2 dbquery.run_sql("CREATE TEMPORARY TABLE test__invenio__utf8 (x char(1), y varbinary(2)) DEFAULT CHARACTER SET utf8") dbquery.run_sql("INSERT INTO test__invenio__utf8 (x, y) VALUES (%s, %s)", (beta_in_utf8, beta_in_utf8)) res = dbquery.run_sql("SELECT x,y,HEX(x),HEX(y),LENGTH(x),LENGTH(y),CHAR_LENGTH(x),CHAR_LENGTH(y) FROM test__invenio__utf8") self.assertEqual(res[0], ('\xce\xb2', '\xce\xb2', 'CEB2', 'CEB2', 2L, 2L, 1L, 2L)) dbquery.run_sql("DROP TEMPORARY TABLE test__invenio__utf8") class WashTableColumnNameTest(InvenioTestCase): """Test if wash_table_column_name and real_escape_string evaluates correctly.""" def test_wash_table_column_name(self): """dbquery - wash table column name""" testcase_error = "foo ; bar" testcase_ok = "foo_bar" self.assertRaises(Exception, dbquery.wash_table_column_name, testcase_error) self.assertEqual(testcase_ok, dbquery.wash_table_column_name(testcase_ok)) def test_real_escape_string(self): """dbquery - real escape string""" testcase_ok = "Programmer" testcase_injection = "' OR ''='" self.assertEqual(dbquery.real_escape_string(testcase_ok), testcase_ok) self.assertNotEqual(dbquery.real_escape_string(testcase_injection), testcase_injection) TEST_SUITE = make_test_suite(TableUpdateTimesTest, WashTableColumnNameTest) if __name__ == "__main__": run_test_suite(TEST_SUITE)