diff --git a/modules/bibclassify/lib/bibclassify_cli.py b/modules/bibclassify/lib/bibclassify_cli.py index ac9b1160f..eabddd59b 100644 --- a/modules/bibclassify/lib/bibclassify_cli.py +++ b/modules/bibclassify/lib/bibclassify_cli.py @@ -1,260 +1,260 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibclassify keyword extractor command line entry point. """ __revision__ = "$Id$" import getopt import os import sys import time try: from bibclassifylib import get_regular_expressions, \ get_keywords_from_text, check_ontology from bibclassify_text_extractor import text_lines_from_local_file, \ text_lines_from_url, is_pdf - from bibclassify_config import * + from bibclassify_config import CFG_BIBCLASSIFY_USER_AGENT except ImportError, err: print >> sys.stderr, "Error: %s" % err sys.exit(1) # Retrieve the custom configuration if it exists. try: from bibclassify_config_local import * except ImportError: # No local configuration was found. pass _OPTIONS = {} def display_help(): """Prints the help message for this module.""" print >> sys.stdout, """Usage: bibclassify [OPTION]... [FILE/URL]... or: bibclassify [OPTION]... [DIRECTORY]... Searches for keywords in FILEs and/or files in DIRECTORY(ies). If a directory is specified, BibClassify will generate keywords for all PDF documents contained in the directory. -h, --help display this help and exit -V, --version output version information and exit -v, --verbose LEVEL sets the verbosity to LEVEL (=0) -k, --ontology FILE sets the FILE to read the ontology from -o, --output-mode TYPE changes the output format to TYPE (text, marcxml or html) (=text) -s, --spires outputs keywords in the SPIRES format -n, --keywords-number INT sets the number of keywords displayed (=20), use 0 to set no limit -m, --matching-mode TYPE changes the search mode to TYPE (full or partial) (=full) --detect-author-keywords detects keywords that are explicitly written in the document --check-ontology checks the ontology and reports warnings and errors --rebuild-cache ignores the existing cache and regenerates it --no-cache don't cache the ontology Backward compatibility (using these options is discouraged): -q equivalent to -s -f FILE URL sets the file to read the keywords from Example: python bibclassifycli.py -k etc/HEP.rdf http://arxiv.org/pdf/0808.1825 python bibclassifycli.py -k etc/HEP.rdf article.pdf python bibclassifycli.py -k etc/HEP.rdf directory/""" sys.exit(0) def main(): """Main function.""" read_options(sys.argv[1:]) # Ontology check if _OPTIONS["check_ontology"]: print >> sys.stdout, ("Checking ontology file %s" % _OPTIONS["ontology_file"]) check_ontology(_OPTIONS["ontology_file"]) # End of ontology check.
# Initialize cache get_regular_expressions(_OPTIONS["ontology_file"], _OPTIONS["rebuild_cache"], _OPTIONS["no_cache"]) sources = {} for entry in _OPTIONS["text_files"]: text_lines = None if os.path.isdir(entry): for filename in os.listdir(entry): if (os.path.isfile(entry + filename) and is_pdf(entry + filename)): text_lines = text_lines_from_local_file(entry + filename) sources[filename] = text_lines elif os.path.isfile(entry): text_lines = text_lines_from_local_file(entry) sources[os.path.basename(entry)] = text_lines else: # Treat as a URL. text_lines = text_lines_from_url(entry, user_agent=CFG_BIBCLASSIFY_USER_AGENT) sources[entry.split("/")[-1]] = text_lines # For each identified source, check the keywords and output them. for source, text_lines in sources.iteritems(): if _OPTIONS["output_mode"] == "text": print >> sys.stdout, source print >> sys.stdout, get_keywords_from_text(text_lines, output_mode=_OPTIONS["output_mode"], output_limit=_OPTIONS["output_limit"], spires=_OPTIONS["spires"], match_mode=_OPTIONS["match_mode"], with_author_keywords=_OPTIONS["with_author_keywords"]) def read_options(options_string): """Reads the options, tests whether the specified values are consistent and populates the options dictionary.""" global _OPTIONS _OPTIONS = {} _OPTIONS["spires"] = False _OPTIONS["output_limit"] = 20 _OPTIONS["text_files"] = [] _OPTIONS["ontology_file"] = "" _OPTIONS["output_mode"] = "text" _OPTIONS["verbose"] = 0 _OPTIONS["match_mode"] = "full" _OPTIONS["output_prefix"] = None _OPTIONS["rebuild_cache"] = False _OPTIONS["no_cache"] = False _OPTIONS["check_ontology"] = False _OPTIONS["with_author_keywords"] = False output_modes = ("html", "text", "marcxml") modes = ("full", "partial") try: long_flags = ["ontology=", "output-mode=", "verbose=", "spires", "keywords-number=", "matching-mode=", "help", "version", "file=", "rebuild-cache", "no-limit", "no-cache", "check-ontology", "detect-author-keywords"] short_flags = "f:k:o:n:m:v:sqhV" opts, args = getopt.gnu_getopt(options_string, short_flags, long_flags) except getopt.GetoptError, err1: print >> sys.stderr, "Options problem: %s" % err1 usage() for opt, arg in opts: if opt in ("-h", "--help"): display_help() elif opt in ("-V", "--version"): try: from invenio.config import CFG_VERSION print >> sys.stdout, ("CDS Invenio/%s bibclassify/%s" % (CFG_VERSION, CFG_VERSION)) except ImportError: print >> sys.stdout, "CDS Invenio bibclassify/standalone" sys.exit(1) elif opt in ("-v", "--verbose"): _OPTIONS["verbose"] = arg elif opt in ("-k", "--ontology"): _OPTIONS["ontology_file"] = arg elif opt in ("-o", "--output-mode"): _OPTIONS["output_mode"] = arg.lower() elif opt in ("-m", "--matching-mode"): _OPTIONS["match_mode"] = arg.lower() # -q for backward compatibility elif opt in ("-s", "--spires", "-q"): _OPTIONS["spires"] = True elif opt in ("-n", "--keywords-number"): _OPTIONS["output_limit"] = arg elif opt == "--rebuild-cache": _OPTIONS["rebuild_cache"] = True elif opt == "--no-cache": _OPTIONS["no_cache"] = True elif opt == "--write-to-file": _OPTIONS["output_prefix"] = arg # -f for compatibility reasons elif opt in ("-f", "--file"): _OPTIONS["text_files"].append(arg) elif opt == "--check-ontology": _OPTIONS["check_ontology"] = True elif opt == "--detect-author-keywords": _OPTIONS["with_author_keywords"] = True if not opts and not args: display_help() _OPTIONS["text_files"] += args # Test if the options are consistent.
if not args: if not _OPTIONS["check_ontology"] and not _OPTIONS["text_files"]: print >> sys.stderr, "ERROR: please specify a file or directory." usage() if not _OPTIONS["ontology_file"]: print >> sys.stderr, "ERROR: please specify an ontology file (-k)." usage() if _OPTIONS["output_mode"] not in output_modes: print >> sys.stderr, ("ERROR: output (-o) should be TEXT, MARCXML or " "HTML.") usage() if _OPTIONS["match_mode"] not in modes: print >> sys.stderr, "ERROR: mode (-m) should be FULL or PARTIAL." usage() try: _OPTIONS["output_limit"] = int(_OPTIONS["output_limit"]) if _OPTIONS["output_limit"] < 0: print >> sys.stderr, ("ERROR: output limit must be a positive " "integer.") except ValueError: print >> sys.stderr, ("ERROR: output limit must be a positive " "integer.") usage() def usage(): """Displays usage (single line) and exit.""" # TODO: write usage display_help() sys.exit(1) def version(): """Display BibClassify version and exit.""" # TODO display_help() sys.exit(0) def write_message(msg, stream=sys.stdout, verbose=1): """Write message and flush output stream (may be sys.stdout or sys.stderr). Useful for debugging stuff. Copied from bibtask.py.""" if msg and _OPTIONS["verbose"] >= verbose: if stream == sys.stdout or stream == sys.stderr: stream.write(time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) try: stream.write("%s\n" % msg) except UnicodeEncodeError: stream.write("%s\n" % msg.encode('ascii', 'backslashreplace')) stream.flush() else: sys.stderr.write("Unknown stream %s. [must be sys.stdout or " "sys.stderr]\n" % stream) if __name__ == '__main__': main() diff --git a/modules/bibclassify/lib/bibclassify_keyword_analyser.py b/modules/bibclassify/lib/bibclassify_keyword_analyser.py index cf042a0e7..1ec9bec68 100644 --- a/modules/bibclassify/lib/bibclassify_keyword_analyser.py +++ b/modules/bibclassify/lib/bibclassify_keyword_analyser.py @@ -1,208 +1,211 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibclassify keyword analysing methods. """ __revision__ = "$Id$" import sys try: - from bibclassify_config import * + from bibclassify_config import CFG_BIBCLASSIFY_VALID_SEPARATORS, \ + CFG_BIBCLASSIFY_AUTHOR_KW_START, \ + CFG_BIBCLASSIFY_AUTHOR_KW_END, \ + CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION except ImportError, err: print >> sys.stderr, "Error: %s" % err sys.exit(1) # Retrieve the custom configuration if it exists. try: from bibclassify_config_local import * except ImportError: # No local configuration was found. pass _MAXIMUM_SEPARATOR_LENGTH = max([len(_separator) for _separator in CFG_BIBCLASSIFY_VALID_SEPARATORS]) def _get_ckw_span(fulltext, spans): """Returns the span of the composite keyword if it is valid. 
Returns None otherwise.""" if spans[0] < spans[1]: words = (spans[0], spans[1]) dist = spans[1][0] - spans[0][1] else: words = (spans[1], spans[0]) dist = spans[0][0] - spans[1][1] if dist == 0: # Two keywords are adjacent. We have a match. return (min(words[0] + words[1]), max(words[0] + words[1])) elif dist <= _MAXIMUM_SEPARATOR_LENGTH: separator = fulltext[words[0][1]:words[1][0] + 1] # Check the separator. if separator.strip() in CFG_BIBCLASSIFY_VALID_SEPARATORS: return (min(words[0] + words[1]), max(words[0] + words[1])) # No valid composite keyword span was found. return None def get_composite_keywords(ckw_db, fulltext, skw_spans): """Returns a list of composite keywords bound with the number of occurrences found in the text string. Format of the output list is (subject, count, component counts).""" # Build the list of composite candidates ckw_list = [] skw_as_components = [] for subject, composite in ckw_db.iteritems(): # Counters for the composite keyword. First count is for the # number of occurrences in the whole document and second count # is for the human defined keywords. ckw_count = 0 matched_spans = [] # Check the alternative labels. for regex in composite.regex: for match in regex.finditer(fulltext): span = list(match.span()) span[1] -= 1 span = tuple(span) if not span in matched_spans: ckw_count += 1 matched_spans.append(span) # Get the single keywords locations. try: components = ckw_db[subject].compositeof except AttributeError: print >> sys.stderr, ("Cached ontology is corrupted. Please " "remove the cached ontology file from your temporary directory.") sys.exit(1) try: spans = [skw_spans[component] for component in components] except KeyError: # The keyword components are not to be found in the text. # This is not a dramatic exception and we can safely ignore # it. pass else: ckw_spans = [] for index in range(len(spans) - 1): if ckw_spans: previous_spans = ckw_spans else: previous_spans = spans[index] ckw_spans = [] for new_span in [(span0, span1) for span0 in previous_spans for span1 in spans[index + 1]]: span = _get_ckw_span(fulltext, new_span) if span is not None: ckw_spans.append(span) for span in [span for span in ckw_spans if not span in matched_spans]: ckw_count += 1 matched_spans.append(span) if ckw_count: # Gather the component counts. component_counts = [] for component in components: skw_as_components.append(component) # Get the single keyword count. try: component_counts.append(len(skw_spans[component])) except KeyError: component_counts.append(0) # Store the composite keyword ckw_list.append((subject, ckw_count, component_counts)) # Remove the single keywords that appear as components from the list # of single keywords. for skw in skw_as_components: try: del skw_spans[skw] except KeyError: pass return ckw_list def get_author_keywords(fulltext): """Finds human-defined keywords in a text string. Searches for the string "Keywords:" and its variants and matches the following words.""" split_string = CFG_BIBCLASSIFY_AUTHOR_KW_START.split(fulltext, 1) if len(split_string) == 1: return [] kw_string = split_string[1] for regex in CFG_BIBCLASSIFY_AUTHOR_KW_END: parts = regex.split(kw_string, 1) kw_string = parts[0] # We separate the keywords.
return CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION.split(kw_string) def _contains_span(span0, span1): """Return true if span0 contains span1, False otherwise.""" if (span0 == span1 or span0[0] > span1[0] or span0[1] < span1[1]): return False return True def get_single_keywords(skw_db, fulltext): """Returns a dictionary of single keywords bound with the positions of the matches in the fulltext. Format of the output dictionary is (subject: positions).""" # Matched span -> subject records = [] for subject, single_keyword in skw_db.iteritems(): for regex in single_keyword.regex: for match in regex.finditer(fulltext): # Modify the right index to put it on the last letter # of the word. span = (match.span()[0], match.span()[1] - 1) # Remove the previous records contained by this span records = [record for record in records if not _contains_span(span, record[0])] add = True for previous_record in records: if ((span, subject) == previous_record or _contains_span(previous_record[0], span)): # Match is contained by a previous match. add = False break if add: records.append((span, subject)) # List of single_keywords: {spans: subject} single_keywords = {} for span, subject in records: single_keywords.setdefault(subject, []).append(span) return single_keywords diff --git a/modules/bibclassify/lib/bibclassifylib.py b/modules/bibclassify/lib/bibclassifylib.py index a3e4df6eb..d4d7e8b43 100644 --- a/modules/bibclassify/lib/bibclassifylib.py +++ b/modules/bibclassify/lib/bibclassifylib.py @@ -1,868 +1,876 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibclassify keyword extractor command line entry point. """ __revision__ = "$Id$" import os import rdflib import re import random import cPickle import sys import tempfile try: from bibclassify_text_normalizer import normalize_fulltext, cut_references from bibclassify_keyword_analyser import get_single_keywords, \ get_composite_keywords, \ get_author_keywords - from bibclassify_config import * + from bibclassify_config import CFG_BIBCLASSIFY_WORD_WRAP, \ + CFG_BIBCLASSIFY_INVARIABLE_WORDS, \ + CFG_BIBCLASSIFY_EXCEPTIONS, \ + CFG_BIBCLASSIFY_UNCHANGE_REGULAR_EXPRESSIONS, \ + CFG_BIBCLASSIFY_GENERAL_REGULAR_EXPRESSIONS, \ + CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, \ + CFG_BIBCLASSIFY_PARTIAL_TEXT, \ + CFG_BIBCLASSIFY_SYMBOLS, \ + CFG_BIBCLASSIFY_SEPARATORS except ImportError, err: print >> sys.stderr, "Error: %s" % err sys.exit(1) # Retrieve the custom configuration if it exists. try: from bibclassify_config_local import * except ImportError: # No local configuration was found. pass # Global variables. 
_SKWS = {} _CKWS = {} _contains_digit = re.compile("\d") _starts_with_non = re.compile("(?i)^non[a-z]") _starts_with_anti = re.compile("(?i)^anti[a-z]") _split_by_punctuation = re.compile("(\W+)") class SingleKeyword: """A single keyword element that processes and stores information fields retrieved from the RDF/SKOS ontology.""" def __init__(self, store, namespace, subject): basic_labels = [] for label in store.objects(subject, namespace["prefLabel"]): basic_labels.append(str(label)) # The concept (==human-readable label of the keyword) is the first # prefLabel. self.concept = basic_labels[0] for label in store.objects(subject, namespace["altLabel"]): basic_labels.append(str(label)) hidden_labels = [] for label in store.objects(subject, namespace["hiddenLabel"]): hidden_labels.append(unicode(label)) self.regex = get_searchable_regex(basic_labels, hidden_labels) note = str(store.value(subject, namespace["note"], any=True)) if note is not None: self.nostandalone = (note.lower() in ("nostandalone", "nonstandalone")) spires = store.value(subject, namespace["spiresLabel"], any=True) if spires is not None: self.spires = str(spires) def __repr__(self): return "".join(["<SingleKeyword: ", self.concept, ">"]) class CompositeKeyword: """A composite keyword element that processes and stores information fields retrieved from the RDF/SKOS ontology.""" def __init__(self, store, namespace, subject): try: self.concept = store.value(subject, namespace["prefLabel"], any=True) except KeyError: # Keyword has no prefLabel. We can ignore that error. print >> sys.stderr, ("Keyword with subject %s has no prefLabel" % subject) small_subject = subject.split("#Composite.")[-1] component_positions = [] for label in store.objects(subject, namespace["compositeOf"]): strlabel = str(label).split("#")[-1] component_name = label.split("#")[-1] component_positions.append((small_subject.find(component_name), strlabel)) self.compositeof = [] component_positions.sort() for position in component_positions: self.compositeof.append(position[1]) spires = store.value(subject, namespace["spiresLabel"], any=True) if spires is not None: self.spires = spires self.regex = [] for label in store.objects(subject, namespace["altLabel"]): pattern = get_regex_pattern(label) self.regex.append(re.compile(CFG_BIBCLASSIFY_WORD_WRAP % pattern)) def __repr__(self): return "".join(["<CompositeKeyword: ", self.concept, ">"]) def build_cache(ontology_file, no_cache=False): """Builds the cached data by parsing the RDF ontology file.""" if rdflib.__version__ >= '2.3.2': store = rdflib.ConjunctiveGraph() else: store = rdflib.Graph() store.parse(ontology_file) namespace = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#") single_count = 0 composite_count = 0 regex_count = 0 for subject_object in store.subject_objects(namespace["prefLabel"]): # Keep only the single keywords. # FIXME: Remove or alter that condition in order to allow using # other ontologies that do not have this composite notion (such # as NASA-subjects.rdf) if not store.value(subject_object[0], namespace["compositeOf"], any=True): strsubject = str(subject_object[0]).split("#")[-1] _SKWS[strsubject] = SingleKeyword(store, namespace, subject_object[0]) single_count += 1 regex_count += len(_SKWS[strsubject].regex) # Let's go through the composite keywords. for subject, pref_label in store.subject_objects(namespace["prefLabel"]): # Keep only the composite keywords.
if store.value(subject, namespace["compositeOf"], any=True): strsubject = str(subject).split("#")[-1] _CKWS[strsubject] = CompositeKeyword(store, namespace, subject) composite_count += 1 regex_count += len(_CKWS[strsubject].regex) store.close() cached_data = {} cached_data["single"] = _SKWS cached_data["composite"] = _CKWS if not no_cache: # Serialize try: filestream = open(get_cache_file(ontology_file), "w") cPickle.dump(cached_data, filestream, 1) filestream.close() except IOError: # Impossible to write the cache. return cached_data return cached_data def capitalize_first_letter(word): """Returns a regex pattern with the first letter accepting both lowercase and uppercase.""" if word[0].isalpha(): # These two cases are necessary in order to get a regex pattern # starting with '[xX]' and not '[Xx]'. This allows to check for # colliding regex afterwards. if word[0].isupper(): return "["+ word[0].swapcase() + word[0] +"]" + word[1:] else: return "["+ word[0] + word[0].swapcase() +"]" + word[1:] return word def convert_punctuation(punctuation, conversion_table): """Returns a regular expression for a punctuation string.""" if punctuation in conversion_table: return conversion_table[punctuation] return re.escape(punctuation) def convert_word(word): """Returns the plural form of the word if it exists, the word itself otherwise.""" out = None # Acronyms. if word.isupper(): out = word + "s?" # Proper nouns or word with digits. elif word.istitle(): out = word + "('?s)?" elif _contains_digit.search(word): out = word if out is not None: return out # Words with non or anti prefixes. if _starts_with_non.search(word): word = "non-?" + capitalize_first_letter(convert_word(word[3:])) elif _starts_with_anti.search(word): word = "anti-?" + capitalize_first_letter(convert_word(word[4:])) if out is not None: return capitalize_first_letter(out) # A few invariable words. if word in CFG_BIBCLASSIFY_INVARIABLE_WORDS: return capitalize_first_letter(word) # Some exceptions that would not produce good results with the set of # general_regular_expressions. if word in CFG_BIBCLASSIFY_EXCEPTIONS: return capitalize_first_letter(CFG_BIBCLASSIFY_EXCEPTIONS[word]) for regex in CFG_BIBCLASSIFY_UNCHANGE_REGULAR_EXPRESSIONS: if regex.search(word) is not None: return capitalize_first_letter(word) for regex, replacement in CFG_BIBCLASSIFY_GENERAL_REGULAR_EXPRESSIONS: stemmed = regex.sub(replacement, word) if stemmed != word: return capitalize_first_letter(stemmed) return capitalize_first_letter(word + "s?") def get_cache(ontology_file): """Get the cached ontology using the cPickle module. No check is done at that stage.""" filestream = open(get_cache_file(ontology_file), "r") try: cached_data = cPickle.load(filestream) except (cPickle.UnpicklingError, AttributeError, DeprecationWarning): print >> sys.stderr, "Problem with existing cache. Regenerating." 
filestream.close() os.remove(get_cache_file(ontology_file)) return build_cache(ontology_file) filestream.close() global _SKWS, _CKWS _SKWS = cached_data["single"] _CKWS = cached_data["composite"] return cached_data def get_cache_file(ontology_file): """Returns the file name of the cached ontology.""" temp_dir = tempfile.gettempdir() cache_file = os.path.basename(ontology_file) + ".db" return os.path.join(temp_dir, cache_file) def get_keywords_from_text(text_lines, ontology_file="", output_mode="text", output_limit=CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER, spires=False, match_mode="full", no_cache=False, with_author_keywords=False): """Returns a formatted string containing the keywords for a single document. If 'ontology_file' has not been specified, the method 'get_regular_expressions' must have been run in order to build or get the cached ontology.""" if not ontology_file: if not _SKWS or not _CKWS: # Cache was not read/created. print >> sys.stderr, ("Please specify an ontology file or " "use the method 'get_regular_expressions' before " "searching for keywords.") sys.exit(1) else: get_regular_expressions(ontology_file, no_cache) text_lines = cut_references(text_lines) fulltext = normalize_fulltext("\n".join(text_lines)) author_keywords = None if with_author_keywords: author_keywords = get_author_keywords(fulltext) if match_mode == "partial": fulltext = get_partial_text(fulltext) single_keywords = get_single_keywords(_SKWS, fulltext) composite_keywords = get_composite_keywords(_CKWS, fulltext, single_keywords) return get_keywords_output(single_keywords, composite_keywords, author_keywords, output_mode, output_limit, spires) def get_keywords_output(single_keywords, composite_keywords, author_keywords=None, style="text", output_limit=0, spires=False): """Returns a formatted string representing the keywords according to the style chosen.""" # Filter the "nonstandalone" keywords single_keywords = filter_nostandalone(single_keywords) # Limit the number of keywords to nkeywords. single_keywords = resize_keywords_for_output(single_keywords, output_limit, single=True) composite_keywords = resize_keywords_for_output(composite_keywords, output_limit, composite=True) if style == "text": return output_text(single_keywords, composite_keywords, author_keywords, spires) elif style == "marcxml": return output_marc(single_keywords, composite_keywords, spires) elif style == "html": return output_html(single_keywords, composite_keywords, spires) def get_partial_text(fulltext): """Returns a shortened version of the fulltext used with the partial matching mode. The version is composed of 20% in the beginning and 20% in the middle of the text.""" length = len(fulltext) get_index = lambda x: int(float(x) / 100 * length) partial_text = [fulltext[get_index(start):get_index(end)] for start, end in CFG_BIBCLASSIFY_PARTIAL_TEXT] return "\n".join(partial_text) def get_regular_expressions(ontology_file, rebuild=False, no_cache=False): """Returns a list of patterns compiled from the RDF/SKOS taxonomy. Uses cache if it exists and if the taxonomy hasn't changed.""" if os.access(ontology_file, os.R_OK): if rebuild or no_cache: return build_cache(ontology_file, no_cache) if os.access(get_cache_file(ontology_file), os.R_OK): if (os.path.getmtime(get_cache_file(ontology_file)) > os.path.getmtime(ontology_file)): # Cache is more recent than the ontology: use cache. return get_cache(ontology_file) else: # Ontology is more recent than the cache: rebuild cache. 
return build_cache(ontology_file, no_cache) else: # Cache does not exist. Build cache. return build_cache(ontology_file, no_cache) else: if os.access(get_cache_file(ontology_file), os.R_OK): # Ontology file not found. Use the cache instead. return get_cache(ontology_file) else: # Cannot access the ontology nor the cache. Exit. print >> sys.stderr, "Neither ontology file nor cache can be read." sys.exit(-1) return None def get_searchable_regex(basic_labels, hidden_labels): """Returns the searchable regular expressions for the single keyword.""" # Hidden labels are used to store regular expressions. hidden_regex_dict = {} for hidden_label in hidden_labels: if is_regex(hidden_label): hidden_regex_dict[hidden_label] = \ re.compile(CFG_BIBCLASSIFY_WORD_WRAP % hidden_label[1:-1]) else: pattern = get_regex_pattern(hidden_label) hidden_regex_dict[hidden_label] = \ re.compile(CFG_BIBCLASSIFY_WORD_WRAP % pattern) # We check if the basic label (preferred or alternative) is matched # by a hidden label regex. If yes, discard it. regex_dict = {} # Create regex for plural forms and add them to the hidden labels. for label in basic_labels: pattern = get_regex_pattern(label) regex_dict[label] = re.compile(CFG_BIBCLASSIFY_WORD_WRAP % pattern) # Merge both dictionaries. regex_dict.update(hidden_regex_dict) return regex_dict.values() def get_regex_pattern(label): """Returns a regular expression of the label that takes care of plural and different kinds of separators.""" parts = _split_by_punctuation.split(label) for index, part in enumerate(parts): if index % 2 == 0: # Word if not parts[index].isdigit(): parts[index] = convert_word(parts[index]) else: # Punctuation if not parts[index + 1]: # The separator is not followed by another word. Treat # it as a symbol. parts[index] = convert_punctuation(parts[index], CFG_BIBCLASSIFY_SYMBOLS) else: parts[index] = convert_punctuation(parts[index], CFG_BIBCLASSIFY_SEPARATORS) return "".join(parts) def is_regex(string): """Checks if a concept is a regular expression.""" return string[0] == "/" and string[-1] == "/" def output_html(single_keywords, composite_keywords, spires=False): """Using the counts for each of the tags, write a simple HTML page to standard output containing a tag cloud representation. 
The CSS describes ten levels, each of which has differing font-size's, line-height's and font-weight's.""" lines = [] lines.append('''<html> <head> <title>Keyword Cloud</title> <style type="text/css"> <!-- a { color: #003DF5; text-decoration: none; } a:hover { color: #f1f1f1; text-decoration: none; background-color: #003DF5; } .pagebox { color: #000; margin-left: 1em; margin-bottom: 1em; border: 1px solid #000; padding: 1em; background-color: #f1f1f1; font-family: arial, sans-serif; max-width: 700px; margin: 10px; padding-left: 10px; float: left; } .pagebox1 { color: #B5B5B5; margin-left: 1em; margin-bottom: 1em; border: 1px dotted #B5B5B5; padding: 1em; background-color: #f2f2f2; font-family: arial, sans-serif; max-width: 300px; margin: 10px; padding-left: 10px; float: left; } .pagebox2 { color: #000; margin-left: 1em; margin-bottom: 1em; border: 0px solid #000; padding: 1em; font-size: x-small; font-family: arial, sans-serif; margin: 10px; padding-left: 10px; float: left; }''') level = ( ''' .level%d { color:#003DF5; font-size:%dpx; line-height:%dpx; font-weight:bold; }''') for index, size in enumerate(range(12, 40, 3)): lines.append(level % (index, size, size + 5)) level_list = (10, 7.5, 5, 4, 3, 2, 1.7, 1.5, 1.3, 1) keyword = (' <span class="level%d" style="color:%s !important">' '%s </span>') lines.append(" -->") lines.append(" </style>") lines.append(" </head>") lines.append(" <body>") lines.append(" <table>") lines.append(" <tr>") lines.append(' <div class="pagebox" align="top" />') tags = [] max_counts = [len(single_keywords[0][1]), composite_keywords[0][1]] # Add the single tags color = "#b5b5b5" for subject, spans in single_keywords: for index, value in enumerate(level_list): if len(spans) <= max_counts[0] / value: if spires: obj = spires_label(subject) else: obj = _SKWS[subject].concept obj = obj.replace(" ", " ") tags.append(keyword % (index, color, obj)) break # Add the composite tags color = "#003df5" for subject, count, components in composite_keywords: for index, value in enumerate(level_list): if count <= max_counts[1] / value: if spires: obj = spires_label(subject) else: obj = _CKWS[subject].concept obj = obj.replace(" ", " ") tags.append(keyword % (index, color, obj)) break # Appends the keywords in a random way (in order to create the cloud # effect) while tags: index = random.randint(0, len(tags) - 1) lines.append(tags[index]) tags[index] = tags[-1] del tags[-1] lines.append(" " * 8 + "</div>") lines.append(" " * 6 + "</tr>") lines.append(" " * 4 + "</table>") lines.append(" " * 2 + "</body>") lines.append("</html>") return "\n".join(lines) def output_marc(single_keywords, composite_keywords, spires=False): """Outputs the keywords in the MARCXML format.""" marc_pattern = ('<datafield tag="653" ind1="1" ind2=" ">\n' ' <subfield code="a">%s</subfield>\n' ' <subfield code="9">BibClassify/HEP</subfield>\n' '</datafield>\n') output = [] for subject, spans in single_keywords: if spires: output.append(spires_label(subject)) else: output.append(_SKWS[subject].concept) for subject, count, components in composite_keywords: if spires: output.append(spires_label(subject)) else: output.append(_CKWS[subject].concept) return "".join([marc_pattern % keyword for keyword in output]) def output_text(single_keywords=None, composite_keywords=None, author_keywords=None, spires=False): """Outputs the results obtained in text format.""" output = [] if author_keywords is not None: output.append("\n\nExplicit keywords:") for keyword in author_keywords: output.append(keyword) if 
composite_keywords is not None: output.append("\n\nComposite keywords:") for subject, count, components in composite_keywords: if spires: concept = spires_label(subject) else: concept = _CKWS[subject].concept output.append("%d %s %s" % (count, concept, components)) if single_keywords is not None: output.append("\n\nSingle keywords:") for subject, spans in single_keywords: if spires: concept = spires_label(subject) else: concept = _SKWS[subject].concept output.append("%d %s" % (len(spans), concept)) return "\n".join(output) + "\n" def check_ontology(ontology_file): """Checks the consistency of the ontology and outputs a list of errors and warnings.""" print "Building graph with Python RDFLib version %s" % rdflib.__version__ if rdflib.__version__ >= '2.3.2': store = rdflib.ConjunctiveGraph() else: store = rdflib.Graph() store.parse(ontology_file) print "Graph was successfully built." prefLabel = "prefLabel" hiddenLabel = "hiddenLabel" altLabel = "altLabel" composite = "composite" compositeOf = "compositeOf" note = "note" both_skw_and_ckw = [] # Build a dictionary we will reason on later. uniq_subjects = {} for subject in store.subjects(): uniq_subjects[subject] = None subjects = {} for subject in uniq_subjects: strsubject = str(subject).split("#Composite.")[-1] strsubject = strsubject.split("#")[-1] if (strsubject == "http://cern.ch/thesauri/HEPontology.rdf" or strsubject == "compositeOf"): continue components = {} for predicate, value in store.predicate_objects(subject): strpredicate = str(predicate).split("#")[-1] strobject = str(value).split("#Composite.")[-1] strobject = strobject.split("#")[-1] components.setdefault(strpredicate, []).append(strobject) if strsubject in subjects: both_skw_and_ckw.append(strsubject) else: subjects[strsubject] = components print "Ontology contains %s concepts." % len(subjects) no_prefLabel = [] multiple_prefLabels = [] multiple_notes = [] bad_notes = [] # Subjects with no composite or compositeOf predicate lonely = [] both_composites = [] bad_hidden_labels = {} bad_alt_labels = {} # Problems with composite keywords composite_problem1 = [] composite_problem2 = [] composite_problem3 = [] composite_problem4 = {} composite_problem5 = [] composite_problem6 = [] stemming_collisions = [] interconcept_collisions = {} for subject, predicates in subjects.iteritems(): # No prefLabel or multiple prefLabels try: if len(predicates[prefLabel]) > 1: multiple_prefLabels.append(subject) except KeyError: no_prefLabel.append(subject) # Lonely and both composites. 
if not composite in predicates and not compositeOf in predicates: lonely.append(subject) elif composite in predicates and compositeOf in predicates: both_composites.append(subject) # Multiple or bad notes if note in predicates: if len(predicates[note]) > 1: multiple_notes.append(subject) bad_notes += [(subject, n) for n in predicates[note] if n != "nostandalone"] # Bad hidden labels if hiddenLabel in predicates: for lbl in predicates[hiddenLabel]: if lbl.startswith("/") ^ lbl.endswith("/"): bad_hidden_labels.setdefault(subject, []).append(lbl) # Bad alt labels if altLabel in predicates: for lbl in predicates[altLabel]: if len(re.findall("/", lbl)) >= 2 or ":" in lbl: bad_alt_labels.setdefault(subject, []).append(lbl) # Check composite if composite in predicates: for ckw in predicates[composite]: if ckw in subjects: if compositeOf in subjects[ckw]: if not subject in subjects[ckw][compositeOf]: composite_problem3.append((subject, ckw)) else: if not ckw in both_skw_and_ckw: composite_problem2.append((subject, ckw)) else: composite_problem1.append((subject, ckw)) # Check compositeOf if compositeOf in predicates: for skw in predicates[compositeOf]: if skw in subjects: if composite in subjects[skw]: if not subject in subjects[skw][composite]: composite_problem6.append((subject, skw)) else: if not skw in both_skw_and_ckw: composite_problem5.append((subject, skw)) else: composite_problem4.setdefault(skw, []).append(subject) # Check for stemmed labels if compositeOf in predicates: labels = (altLabel, hiddenLabel) else: labels = (prefLabel, altLabel, hiddenLabel) patterns = {} for label in [lbl for lbl in labels if lbl in predicates]: for expression in [expr for expr in predicates[label] if not is_regex(expr)]: pattern = get_regex_pattern(expression) interconcept_collisions.setdefault(pattern, []).append((subject, label)) if pattern in patterns: stemming_collisions.append((subject, patterns[pattern], (label, expression) )) else: patterns[pattern] = (label, expression) print "\n==== ERRORS ====" if no_prefLabel: print "\nConcepts with no prefLabel: %d" % len(no_prefLabel) print "\n".join([" %s" % subj for subj in no_prefLabel]) if multiple_prefLabels: print ("\nConcepts with multiple prefLabels: %d" % len(multiple_prefLabels)) print "\n".join([" %s" % subj for subj in multiple_prefLabels]) if both_composites: print ("\nConcepts with both composite properties: %d" % len(both_composites)) print "\n".join([" %s" % subj for subj in both_composites]) if bad_hidden_labels: print "\nConcepts with bad hidden labels: %d" % len(bad_hidden_labels) for kw, lbls in bad_hidden_labels.iteritems(): print " %s:" % kw print "\n".join([" '%s'" % lbl for lbl in lbls]) if bad_alt_labels: print "\nConcepts with bad alt labels: %d" % len(bad_alt_labels) for kw, lbls in bad_alt_labels.iteritems(): print " %s:" % kw print "\n".join([" '%s'" % lbl for lbl in lbls]) if both_skw_and_ckw: print ("\nKeywords that are both skw and ckw: %d" % len(both_skw_and_ckw)) print "\n".join([" %s" % subj for subj in both_skw_and_ckw]) print if composite_problem1: print "\n".join(["SKW '%s' references an unexisting CKW '%s'." % (skw, ckw) for skw, ckw in composite_problem1]) if composite_problem2: print "\n".join(["SKW '%s' references a SKW '%s'." % (skw, ckw) for skw, ckw in composite_problem2]) if composite_problem3: print "\n".join(["SKW '%s' is not composite of CKW '%s'." 
% (skw, ckw) for skw, ckw in composite_problem3]) if composite_problem4: for skw, ckws in composite_problem4.iteritems(): print "SKW '%s' does not exist but is referenced by:" % skw print "\n".join([" %s" % ckw for ckw in ckws]) if composite_problem5: print "\n".join(["CKW '%s' references a CKW '%s'." % kw for kw in composite_problem5]) if composite_problem6: print "\n".join(["CKW '%s' is not composed of SKW '%s'." % kw for kw in composite_problem6]) print "\n==== WARNINGS ====" if multiple_notes: print "\nConcepts with multiple notes: %d" % len(multiple_notes) print "\n".join([" %s" % subj for subj in multiple_notes]) if bad_notes: print ("\nConcepts with bad notes: %d" % len(bad_notes)) print "\n".join([" '%s': '%s'" % note for note in bad_notes]) if stemming_collisions: print ("\nThe following keywords have unnecessary labels that have " "already been generated by BibClassify.") for subj in stemming_collisions: print " %s:\n %s\n and %s" % subj print "\nFinished." sys.exit(0) def filter_nostandalone(keywords): """Returns a copy of the keywords data structure stripped of its nonstandalone components.""" filtered_keywords = {} for subject, spans in keywords.iteritems(): if not _SKWS[subject].nostandalone: filtered_keywords[subject] = spans return filtered_keywords def compare_skw(skw0, skw1): """Compare 2 single keyword records. First compare the occurrences, then the length of the word.""" list_comparison = cmp(len(skw1[1]), len(skw0[1])) if list_comparison: return list_comparison else: return cmp(len(skw1[0]), len(skw0[0])) def compare_ckw(ckw0, ckw1): """Compare 2 composite keyword records. First compare the occurrences, then the length of the word, and finally the component counts.""" count_comparison = cmp(ckw1[1], ckw0[1]) if count_comparison: return count_comparison component_avg0 = sum(ckw0[2]) / len(ckw0[2]) component_avg1 = sum(ckw1[2]) / len(ckw1[2]) component_comparison = cmp(component_avg1, component_avg0) if component_comparison: return component_comparison else: return cmp(len(ckw1[0]), len(ckw0[0])) def resize_keywords_for_output(keywords, limit=20, single=False, composite=False): """Returns the keyword data structure sorted and resized to the given limit. Exactly one of 'single' and 'composite' must be set to indicate the type of keywords passed in.""" if not (single ^ composite): print >> sys.stderr, "Problem in resize_keywords_for_output." sys.exit(1) if single: keywords = list(keywords.items()) keywords.sort(compare_skw) elif composite: keywords.sort(compare_ckw) if limit: return keywords[:limit] else: return keywords def spires_label(subject): """Returns the SPIRES representation of a keyword. If the spiresLabel is set, then it returns that value; otherwise it replaces the colon in the prefLabel by a comma.""" try: if subject in _SKWS: return _SKWS[subject].spires except AttributeError: # The keyword doesn't have a SPIRES label. return _SKWS[subject].concept try: return _CKWS[subject].spires except AttributeError: # The keyword doesn't have a SPIRES label. Build "comp1, comp2". components = _CKWS[subject].compositeof spires_labels = [spires_label(component) for component in components] return ", ".join(spires_labels) if __name__ == "__main__": print >> sys.stderr, "Please use bibclassifycli from now on."
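For reference, below is a minimal sketch (not part of the patch above) of how the library API touched by this diff might be driven programmatically rather than through bibclassify_cli.py. It assumes the bibclassify modules are importable and uses purely hypothetical file names ("HEP.rdf" for a local RDF/SKOS ontology, "article.txt" for an extracted plain-text fulltext); reading the fulltext with a plain open() is a simplification of what the CLI does via bibclassify_text_extractor.

# Illustrative sketch only -- not part of the patch above.
# "HEP.rdf" and "article.txt" are placeholder names.
from bibclassifylib import get_regular_expressions, get_keywords_from_text

ONTOLOGY = "HEP.rdf"      # hypothetical path to the RDF/SKOS ontology
FULLTEXT = "article.txt"  # hypothetical path to an extracted fulltext

# Build (or load) the cached regular expressions for the ontology.
get_regular_expressions(ONTOLOGY, rebuild=False, no_cache=False)

# get_keywords_from_text expects the document as a list of text lines.
text_lines = open(FULLTEXT).read().splitlines()

# Extract and print up to 20 keywords in plain-text format.
print get_keywords_from_text(text_lines,
                             output_mode="text",
                             output_limit=20,
                             spires=False,
                             match_mode="full",
                             with_author_keywords=True)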