config.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Wed, Dec 11, 17:45

config.py
View Options

	# -*- coding: utf-8 -*-
	##
	## This file is part of Invenio.
	## Copyright (C) 2007, 2008, 2009, 2010, 2011, 2013, 2014 CERN.
	##
	## Invenio is free software; you can redistribute it and/or
	## modify it under the terms of the GNU General Public License as
	## published by the Free Software Foundation; either version 2 of the
	## License, or (at your option) any later version.
	##
	## Invenio is distributed in the hope that it will be useful, but
	## WITHOUT ANY WARRANTY; without even the implied warranty of
	## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	## General Public License for more details.
	##
	## You should have received a copy of the GNU General Public License
	## along with Invenio; if not, write to the Free Software Foundation, Inc.,
	## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

	"""
	BibClassify configuration file.

	When writing changes, please either delete the cached ontology in your
	temporary directory or use the rebuild-cache option in order to
	regenerate the cached ontology.

	If you want to change this configuration, we recommend creating a
	local configuration file named 'bibclassify_config_local.py' that
	contains the changes to apply.
	"""

	import re
	import logging
	import sys
	import os
	from invenio import config

	# Version string exposed by this module.
	VERSION = '0.4.9'

	# Level given to the loggers configured by get_logger(); it is replaced at
	# runtime by set_global_level().
	logging_level = logging.ERROR


	# ------------- main config -----------

	# Whether generated keywords are saved into the database.
	# NOTE(review): the original remark here was "daemon does that" --
	# presumably the saving is carried out by the daemon; confirm.
	CFG_DB_SAVE_KW = True

	# Number of keywords that are printed by default (this limits single keywords,
	# composite keywords, and acronyms - not author keywords)
	CFG_BIBCLASSIFY_DEFAULT_OUTPUT_NUMBER = 20

	# The main MARCXML field in which keywords are looked up and saved; the
	# value includes the indicators.
	CFG_MAIN_FIELD = '6531_'

	# Other fields to take from the MARCXML when generating the tag cloud or
	# the list of keywords.
	CFG_OTHER_FIELDS = ['6950_']

	# Where to save author supplied keywords
	# NOTE(review): empty here; presumably means "do not save" -- confirm.
	CFG_AUTH_FIELD = ''

	# Where to save extracted acronyms
	# NOTE(review): empty here; presumably means "do not save" -- confirm.
	CFG_ACRON_FIELD = ''

	# ------------ bibclass config -------

	# PARTIAL_TEXT
	# Marks the part of the fulltext to keep when running a partial match.
	# Each tuple contains the start and end percentages of a section, so the
	# value below keeps the 0-20% and the 40-60% sections.
	CFG_BIBCLASSIFY_PARTIAL_TEXT = ((0, 20), (40, 60))


	# Format and output MARCXML records in SPIRES format
	CFG_SPIRES_FORMAT = False


	# The taxonomy used when no taxonomy is specified
	CFG_EXTRACTION_TAXONOMY = 'HEP'


	# WORD TRANSFORMATIONS

	# BibClassify creates a regular expression for each label found in the
	# ontology.
	# If the keyword belongs in 'INVARIABLE_WORDS', we return it without any
	# change.
	# If the keyword is found in 'EXCEPTIONS', we return its attached
	# regular expression.
	# If the keyword is matched by a regular expression of
	# 'UNCHANGE_REGULAR_EXPRESSIONS', we return the keyword without any
	# change.
	# At last, we perform the sub method of Python's re module using the
	# first element of the tuple as the regex and the second element as the
	# replacement string.

	# Regular expressions found here have been originally based on
	# Wikipedia's page on English plural.
	# [http://en.wikipedia.org/wiki/English_plural]

	# Words that are returned as-is: no plural/variant regex is built for them.
	CFG_BIBCLASSIFY_INVARIABLE_WORDS = (
	    # Three-letter words.
	    "any", "big", "chi", "der", "eta", "few", "low", "new", "non", "off",
	    "one", "out", "phi", "psi", "rho", "tau", "two", "van", "von",
	    # Four-letter words.
	    "hard", "weak", "four", "anti", "zero", "sinh", "open", "high", "data",
	    "dark", "free", "flux", "fine",
	    # Longer words.
	    "final", "heavy", "strange",
	)

	# Keywords with a hand-written regular expression. When a keyword is found
	# here, its attached pattern is used instead of a generated one.
	# The alternations must use a bare "|": an escaped "\|" matches a literal
	# pipe character and would stop e.g. "deflections" or "matrices" matching.
	CFG_BIBCLASSIFY_EXCEPTIONS = {
	    "aluminium": r"alumini?um",
	    "aluminum": r"alumini?um",
	    "analysis": r"analy[sz]is",
	    "analyzis": r"analy[sz]is",
	    "behavior": r"behaviou?rs?",
	    "behaviour": r"behaviou?rs?",
	    "color": r"colou?rs?",
	    "colour": r"colou?rs?",
	    "deflexion": r"defle(x|ct)ions?",
	    "flavor": r"flavou?rs?",
	    "flavour": r"flavou?rs?",
	    "gas": r"gas(s?es)?",
	    "lens": r"lens(es)?",
	    "matrix": r"matri(x(es)?|ces)",
	    "muon": r"muons?",
	    "neutrino": r"neutrinos?",
	    "reflexion": r"refle(x|ct)ions?",
	    "ring": r"rings?",
	    "status": r"status(es)?",
	    "string": r"strings?",
	    "sum": r"sums?",
	    "vertex": r"vert(ex(es)?|ices)",
	    "vortex": r"vort(ex(es)?|ices)",
	}

	# A keyword matched by any of these compiled patterns is returned without
	# any change.
	CFG_BIBCLASSIFY_UNCHANGE_REGULAR_EXPRESSIONS = tuple(
	    re.compile(ending)
	    for ending in (
	        r"[^e]ed$",
	        r"ics?$",
	        r"[io]s$",
	        r"ium$",
	        r"less$",
	        r"ous$",
	    )
	)

	# IDEAS
	# "al$" -> "al(ly)?"

	# Generic (pattern, replacement) pairs: the compiled pattern is applied to
	# the keyword with re.sub() and the replacement is the regex fragment that
	# covers the word's variants ("\1" refers to the first captured group).
	#
	# Alternations are written with a bare "|": an escaped "\|" would match (or,
	# in a replacement, emit) a literal pipe. Patterns are raw strings so that
	# no backslash is interpreted by the Python string literal.
	# The "[sz]ation" rule used to be listed twice; the second copy could never
	# have an effect and has been dropped.
	CFG_BIBCLASSIFY_GENERAL_REGULAR_EXPRESSIONS = (
	    (re.compile(r"ional"), r"ional(ly)?"),
	    (re.compile(r"([ae])n(ce|t)$"), r"\1n(t|ces?)"),
	    (re.compile(r"og(ue)?$"), r"og(ue)?s?"),
	    (re.compile(r"([^aeiouyc])(re|er)$"), r"\1(er|re)s?"),
	    (re.compile(r"([aeiouy])[sz]ation$"), r"\1[zs]ations?"),
	    (re.compile(r"([^aeiou])(y|ies)$"), r"\1(y|ies)"),
	    (re.compile(r"o$"), r"o(e?s)?"),
	    (re.compile(r"(x|sh|ch|ss)$"), r"\1(es)?"),
	    (re.compile(r"f$"), r"(f|ves)"),
	    (re.compile(r"ung$"), r"ung(en)?"),
	    (re.compile(r"([^aiouy])s$"), r"\1s?"),
	    (re.compile(r"([^o])us$"), r"\1(i|us(es)?)"),
	    (re.compile(r"um$"), r"(a|ums?)"),
	)

	# PUNCTUATION TRANSFORMATIONS

	# When building the regex pattern for each label of the ontology, we also take
	# care of the non-alpha characters. Below are two sets of transformations.
	# 'SEPARATORS' contains the transformation for the non-alpha characters that
	# can be found between two words.
	# 'SYMBOLS' contains punctuation that can be found at the end of a word.
	# In both cases, if the separator is not found in the dictionaries, we return
	# re.escape(separator)

	# Maps a separator found between two words of a label to the regex fragment
	# that should match it in the text.
	# The asterisk entry had lost its "*" characters and read '"": r"[\s]?"';
	# an empty string is not a separator, so the entry is restored to map "*"
	# to an optional asterisk or whitespace character.
	CFG_BIBCLASSIFY_SEPARATORS = {
	    " ": r"[\s\n-]",
	    "-": r"[\s\n-]?",
	    "/": r"[/\s]?",
	    "(": r"\s?\(",
	    "*": r"[*\s]?",
	    "- ": r"\s?\-\s",
	    "+ ": r"\s?\+\s",
	}

	# Punctuation that can be found at the end of a word, mapped to the regex
	# fragment used to match it (an apostrophe, optionally preceded by one
	# whitespace character).
	CFG_BIBCLASSIFY_SYMBOLS = {"'": r"\s?\'"}

	# Template wrapped around a keyword pattern (substituted for "%s"): the
	# keyword must be preceded and followed by a character that is neither a
	# word character nor a hyphen.
	# Raw string: "\w" in a normal literal is an invalid escape sequence and
	# triggers a warning on recent Python versions; the value is unchanged.
	CFG_BIBCLASSIFY_WORD_WRAP = r"[^\w-]%s[^\w-]"

	# MATCHING

	# When searching for composite keywords, we allow two keywords separated by one
	# of the components of 'VALID_SEPARATORS' to form a composite keyword. These
	# separators also contain the punctuation.

	# Word sequences allowed between two keywords for them to be treated as one
	# composite keyword.
	CFG_BIBCLASSIFY_VALID_SEPARATORS = (
	    # "of", alone or followed by a determiner/quantifier.
	    "of",
	    "of a",
	    "of an",
	    "of the",
	    "of this",
	    "of one",
	    "of two",
	    "of three",
	    "of new",
	    "of other",
	    "of many",
	    "of both",
	    "of these",
	    "of each",
	    # Other linking words.
	    "is",
	    "the",
	)

	# AUTHOR KEYWORDS

	# When looking for the keywords already defined in the document, we run the
	# following set of regex.

	# Start of the author-keyword section: "keyword(s)" / "key words" /
	# "key-words", optionally followed by more lowercase words (e.g. "and
	# phrases"), then "." or ":" and any spaces. Case-insensitive.
	# The "*" quantifiers after "[ -]" and "[a-z ]" are required: without them
	# the pattern demands exactly one character at each place and a plain
	# "Keywords:" is not recognised.
	CFG_BIBCLASSIFY_AUTHOR_KW_START = re.compile(
	    r"(?i)key[ -]*words?[a-z ]*[.:] *")

	# Patterns marking the end of the author-keyword section.
	CFG_BIBCLASSIFY_AUTHOR_KW_END = (
	    re.compile(r"\n"),
	    re.compile(r"\.\W"),
	    re.compile(r"\sPACS"),
	    re.compile(r"(?i)1[. ]*introduction\W"),
	    re.compile(r"(?i)mathematics subject classification\W"),
	)

	# Separator between two author keywords: ";" or "," with optional spaces
	# around it, or a hyphen followed by a space. The alternatives must be
	# joined with a bare "|"; an escaped "\|" matches a literal pipe, turning
	# the whole expression into one sequence that never occurs.
	CFG_BIBCLASSIFY_AUTHOR_KW_SEPARATION = re.compile(r" ?; ?| ?, ?| ?- ")


	# Modules to call to get output from them. Empty by default; the
	# commented-out line shows the form of an entry.
	#CFG_EXTERNAL_MODULES = {'webtag' : 'call_from_outside'}
	CFG_EXTERNAL_MODULES = {}

	# Module-wide logger. It stays None until it is assigned the result of
	# get_logger('bibclassify') further down; get_logger() tests it to decide
	# whether a new logger must be configured from scratch.
	log = None
	# Every logger handed out by get_logger(), kept so that set_global_level()
	# can change the level of all of them.
	_loggers = []


	def get_logger(name):
	    """Return the logger called *name*, set up for bibclassify.

	    While the module-level ``log`` is still unset, the logger is configured
	    here: a stderr handler with the bibclassify format is attached, the
	    level is set to ``logging_level`` and propagation is switched off.
	    Once ``log`` exists, the logger is simply fetched from ``log.manager``
	    and no handler or level is set on it.

	    Every logger returned is remembered in ``_loggers`` so that
	    set_global_level() can later change its level.

	    :param name: name of the logger, e.g. ``'bibclassify'``.
	    :return: the ``logging.Logger`` registered under *name*.
	    """
	    if log is not None:
	        logger = log.manager.getLogger(name)
	    else:
	        logger = logging.getLogger(name)
	        # logging.getLogger() returns the same object for the same name, so
	        # attach the handler only once. Otherwise running this setup again
	        # (e.g. after a module reload resets ``log``) would emit every
	        # message several times.
	        if not logger.handlers:
	            hdlr = logging.StreamHandler(sys.stderr)
	            formatter = logging.Formatter(
	                '%(levelname)s %(name)s:%(lineno)d %(message)s')
	            hdlr.setFormatter(formatter)
	            logger.addHandler(hdlr)
	        logger.setLevel(logging_level)
	        # Keep messages away from the root logger's handlers.
	        logger.propagate = False
	    if logger not in _loggers:
	        _loggers.append(logger)
	    return logger


	def set_global_level(level):
	    """Set the logging level of all loggers created through get_logger().

	    The value is converted with ``int()``, stored in the module-level
	    ``logging_level`` and applied to every logger recorded in ``_loggers``.

	    :param level: numeric logging level (anything ``int()`` accepts).
	    """
	    global logging_level
	    logging_level = int(level)
	    for registered_logger in _loggers:
	        registered_logger.setLevel(logging_level)


	# Module-wide logger; created first so everything below can log.
	log = get_logger('bibclassify')

	## Standalone mode has been removed: the flag is always False, so the branch
	## below is never executed. The former detection code was:
	#try:
	#    import invenio.legacy.search_engine
	#except:
	#    STANDALONE = True
	#    log.warning('Bibclassify is running in a standalone mode, access to database is not supported')
	STANDALONE = False


	if STANDALONE:
	    import tempfile

	    # Look for a writable directory to use as etcdir. First choice: this
	    # file's directory, provided it holds a writable 'bibclassify' folder.
	    etcdir = ' '
	    bibetc = os.path.join(os.path.dirname(__file__), 'bibclassify')
	    if os.path.isdir(bibetc) and os.access(bibetc, os.W_OK):
	        etcdir = os.path.dirname(__file__)

	    # Second choice: the 'etc' directory three levels up. Last resort: the
	    # system temporary directory.
	    if not (os.path.isdir(etcdir) and os.access(etcdir, os.W_OK)):
	        etcdir = os.path.abspath(
	            os.path.join(os.path.dirname(__file__), "../../../etc"))
	        if not (os.path.isdir(etcdir) and os.access(etcdir, os.W_OK)):
	            etcdir = tempfile.gettempdir()

	    log.warning("Setting CFG_CACHEDIR, CFG_WEBDIR, CFG_ETCDIR to: %s" % etcdir)

	    # Point the special Invenio paths at the directory found above.
	    config.CFG_CACHEDIR = etcdir
	    config.CFG_WEBDIR = etcdir
	    config.CFG_ETCDIR = etcdir


	# Shadow the Invenio config variables that bibclassify modules use, so they
	# can be read from this module. The copies are taken after the STANDALONE
	# branch above (which may replace three of the paths on `config`), and the
	# local-configuration loop below may overwrite them again, since it
	# replaces any module global whose name contains 'CFG'.
	CFG_PREFIX = config.CFG_PREFIX
	CFG_CACHEDIR = config.CFG_CACHEDIR
	CFG_WEBDIR = config.CFG_WEBDIR
	CFG_ETCDIR = config.CFG_ETCDIR
	CFG_TMPDIR = config.CFG_TMPDIR


	# Apply the overrides from the optional local configuration module.
	# Every attribute of that module whose name contains 'CFG' replaces the
	# attribute of the same name on the Invenio `config` module and/or in this
	# module, wherever such a name already exists.
	try:
	    from invenio import bibclassify_config_local as localconf

	    for confid in dir(localconf):
	        if 'CFG' not in confid:
	            continue

	        if hasattr(config, confid):
	            log.info('Overriding global config %s with %s' % (
	                confid, getattr(localconf, confid)))
	            setattr(config, confid, getattr(localconf, confid))

	        if confid in globals():
	            globals()[confid] = getattr(localconf, confid)
	            log.info('Overriding bibclassify config %s with %s' % (
	                confid, getattr(localconf, confid)))
	except ImportError:
	    # There is no local configuration module: keep the defaults.
	    pass

	log.info('Initialized bibclassify config')

config.pyNo OneTemporaryActions

File Metadata

config.pyView Options

Event Timeline

config.py
No OneTemporary
Actions

config.py
View Options