diff --git a/invenio/legacy/bibindex/engine_utils.py b/invenio/legacy/bibindex/engine_utils.py index 845a6c4d3..54671c77d 100644 --- a/invenio/legacy/bibindex/engine_utils.py +++ b/invenio/legacy/bibindex/engine_utils.py @@ -1,555 +1,555 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """bibindex.engine_utils: here are some useful regular experssions for tokenizers and several helper functions. """ import re import sys from invenio.base.helpers import utf8ifier from invenio.legacy.dbquery import run_sql, \ DatabaseError from invenio.legacy.bibsched.bibtask import write_message from invenio.legacy.bibrecord import get_fieldvalues from invenio.config import \ CFG_BIBINDEX_CHARS_PUNCTUATION, \ CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]') phrase_delimiter_re = re.compile(r'[\.:;\?\!]') space_cleaner_re = re.compile(r'\s+') re_block_punctuation_begin = re.compile(r"^" + CFG_BIBINDEX_CHARS_PUNCTUATION + "+") re_block_punctuation_end = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION + "+$") re_punctuation = re.compile(CFG_BIBINDEX_CHARS_PUNCTUATION) re_separators = re.compile(CFG_BIBINDEX_CHARS_ALPHANUMERIC_SEPARATORS) re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d') re_pattern_fuzzy_author_trigger = re.compile(r'[\s\,\.]') # FIXME: re_pattern_fuzzy_author_trigger could be removed and an # BibAuthorID API function could be called instead after we # double-check that there are no circular imports. def load_tokenizers(): """ Load all the bibindex tokenizers and returns it. """ import warnings warnings.warn("The function is deprecated. Please use the " "`load_tokenizers()` from `invenio.modules.indexer.utils`", DeprecationWarning) from invenio.modules.indexer.registry import tokenizers return dict((module.__name__.split('.')[-1], getattr(module, module.__name__.split('.')[-1], '')) for module in tokenizers) def get_all_index_names_and_column_values(column_name): """Returns a list of tuples of name and another column of all defined words indexes. Returns empty list in case there are no tags indexed in this index or in case the column name does not exist. 
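For quick orientation, a standalone sketch exercising two of the module-level patterns defined above (the sample strings are made up for illustration):

import re

latex_formula_re = re.compile(r'\$.*?\$|\\\[.*?\\\]')
re_arxiv = re.compile(r'^arxiv:\d\d\d\d\.\d\d\d\d')

# inline and display LaTeX formulas are matched as whole units
assert latex_formula_re.findall(r"mass $m_H$ and \[E = mc^2\]") == ['$m_H$', r'\[E = mc^2\]']
# arXiv identifiers are recognised only in the lowercase 'arxiv:NNNN.NNNN' form
assert re_arxiv.match("arxiv:1207.7214") is not None
assert re_arxiv.match("arXiv:1207.7214") is None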
Example: output=[('global', something), ('title', something)].""" out = [] query = """SELECT name, %s FROM idxINDEX""" % column_name try: res = run_sql(query) for row in res: out.append((row[0], row[1])) except DatabaseError: write_message("Exception caught for SQL statement: %s; column %s might not exist" % (query, column_name), sys.stderr) return out def get_all_synonym_knowledge_bases(): """Returns a dictionary of name key and knowledge base name and match type tuple value information of all defined words indexes that have knowledge base information. Returns empty dictionary in case there are no tags indexed. Example: output['global'] = ('INDEX-SYNONYM-TITLE', 'exact'), output['title'] = ('INDEX-SYNONYM-TITLE', 'exact').""" res = get_all_index_names_and_column_values("synonym_kbrs") out = {} for row in res: kb_data = row[1] # ignore empty strings if len(kb_data): out[row[0]] = tuple(kb_data.split(CFG_BIBINDEX_COLUMN_VALUE_SEPARATOR)) return out def get_index_remove_stopwords(index_id): """Returns value of a remove_stopword field from idxINDEX database table if it's not 'No'. If it's 'No' returns False. Just for consistency with WordTable. @param index_id: id of the index """ try: result = run_sql("SELECT remove_stopwords FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'No' or result == '': return False return result def get_index_remove_html_markup(index_id): """ Gets remove_html_markup parameter from database ('Yes' or 'No') and changes it to True, False. Just for consistency with WordTable.""" try: result = run_sql("SELECT remove_html_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'Yes': return True return False def get_index_remove_latex_markup(index_id): """ Gets remove_latex_markup parameter from database ('Yes' or 'No') and changes it to True, False. Just for consistency with WordTable.""" try: result = run_sql("SELECT remove_latex_markup FROM idxINDEX WHERE ID=%s", (index_id, ))[0][0] except: return False if result == 'Yes': return True return False def author_name_requires_phrase_search(p): """ Detect whether author query pattern p requires phrase search. Notably, look for presence of spaces and commas. """ if re_pattern_fuzzy_author_trigger.search(p): return True return False def get_field_count(recID, tags): """ Return number of field instances having TAGS in record RECID. @param recID: record ID @type recID: int @param tags: list of tags to count, e.g. ['100__a', '700__a'] @type tags: list @return: number of tags present in record @rtype: int @note: Works internally via getting field values, which may not be very efficient. Could use counts only, or else retrieve stored recstruct format of the record and walk through it. """ out = 0 for tag in tags: out += len(get_fieldvalues(recID, tag)) return out def run_sql_drop_silently(query): """ SQL DROP statement with IF EXISTS part generates warning if table does not exist. To mute the warning we can remove IF EXISTS and catch SQL exception telling us that table does not exist. """ try: query = query.replace(" IF EXISTS", "") run_sql(query) except Exception as e: if str(e).find("Unknown table") > -1: pass else: raise e from invenio.modules.indexer.utils import get_idx_indexer def get_all_indexes(virtual=True, with_ids=False): """Returns the list of the names of all defined words indexes. Returns empty list in case there are no tags indexed in this index. 
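A minimal check of the fuzzy-author trigger used by author_name_requires_phrase_search() above; the names are illustrative only:

from invenio.legacy.bibindex.engine_utils import author_name_requires_phrase_search

# a comma, dot or whitespace in the pattern switches the author query to phrase search
assert author_name_requires_phrase_search("Ellis, J.")
# a bare single token can be handled by ordinary word search
assert not author_name_requires_phrase_search("Ellis")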
@param virtual: if True function will return also virtual indexes @param with_ids: if True function will return also IDs of found indexes Example: output=['global', 'author'].""" out = [] if virtual: query = """SELECT %s name FROM idxINDEX""" query = query % (with_ids and "id," or "") else: query = """SELECT %s w.name FROM idxINDEX AS w WHERE w.id NOT IN (SELECT DISTINCT id_virtual FROM idxINDEX_idxINDEX)""" query = query % (with_ids and "w.id," or "") res = run_sql(query) if with_ids: out = [row for row in res] else: out = [row[0] for row in res] return out def get_all_virtual_indexes(): """ Returns all defined 'virtual' indexes. """ query = """SELECT DISTINCT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_virtual=w.id""" res = run_sql(query) return res def get_index_virtual_indexes(index_id): """Returns 'virtual' indexes that should be indexed together with given index.""" query = """SELECT v.id_virtual, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_virtual=w.id AND v.id_normal=%s""" res = run_sql(query, (index_id,)) return res def is_index_virtual(index_id): """Checks if index is virtual""" query = """SELECT id_virtual FROM idxINDEX_idxINDEX WHERE id_virtual=%s""" res = run_sql(query, (index_id,)) if res: return True return False def filter_for_virtual_indexes(index_list): """ Function removes all non-virtual indexes from given list of indexes. @param index_list: list of index names """ try: virtual = zip(*get_all_virtual_indexes())[1] selected = set(virtual) & set(index_list) return list(selected) except IndexError: return [] return [] def get_virtual_index_building_blocks(index_id): """Returns indexes that made up virtual index of given index_id. If index_id is an id of normal index (not virtual) returns empty tuple. """ query = """SELECT v.id_normal, w.name FROM idxINDEX_idxINDEX AS v, idxINDEX AS w WHERE v.id_normal=w.id AND v.id_virtual=%s""" res = run_sql(query, (index_id,)) return res def get_index_id_from_index_name(index_name): """Returns the words/phrase index id for INDEXNAME. Returns empty string in case there is no words table for this index. Example: field='author', output=4.""" out = 0 query = """SELECT w.id FROM idxINDEX AS w WHERE w.name=%s LIMIT 1""" res = run_sql(query, (index_name,), 1) if res: out = res[0][0] return out def get_index_name_from_index_id(index_id): """Returns the words/phrase index name for INDEXID. Returns '' in case there is no words table for this indexid. Example: field=9, output='fulltext'.""" res = run_sql("SELECT name FROM idxINDEX WHERE id=%s", (index_id,)) if res: return res[0][0] return '' def get_field_tags(field, tagtype="marc"): """Returns a list of tags for the field code 'field'. Works for both MARC and nonMARC tags. Returns empty list in case of error. Example: field='author', output=['100__%','700__%']. 
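The index-lookup helpers above all go through run_sql, so their results depend on the idxINDEX tables of the running instance; a typical call pattern, with indicative values taken from the docstring examples, would be:

from invenio.legacy.bibindex.engine_utils import (
    filter_for_virtual_indexes,
    get_index_id_from_index_name,
    get_index_name_from_index_id,
)

index_id = get_index_id_from_index_name('author')     # e.g. 4; 0 when the index is unknown
index_name = get_index_name_from_index_id(index_id)   # 'author' again; '' when unknown
virtual_only = filter_for_virtual_indexes(['global', 'author'])  # e.g. ['global']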
@param tagtype: can be: "marc" or "nonmarc", default value is "marc" for backward compatibility """ out = [] query = """SELECT t.%s FROM tag AS t, field_tag AS ft, field AS f WHERE f.code=%%s AND ft.id_field=f.id AND t.id=ft.id_tag ORDER BY ft.score DESC""" if tagtype == "marc": query = query % "value" res = run_sql(query, (field,)) return [row[0] for row in res] else: query = query % "recjson_value" res = run_sql(query, (field,)) values = [] for row in res: if row[0] is not None: values.extend(row[0].split(",")) return values def get_marc_tag_indexes(tag, virtual=True): """Returns indexes names and ids corresponding to the given tag @param tag: MARC tag in one of the forms: 'xx%', 'xxx', 'xxx__a', 'xxx__%' @param virtual: if True function will also return virtual indexes""" tag2 = tag[0:2] + "%" #for tags in the form: 10% tag3 = tag[:-1] + "%" #for tags in the form: 100__% query = """SELECT DISTINCT w.id,w.name FROM idxINDEX AS w, idxINDEX_field AS wf, field_tag AS ft, tag as t WHERE (t.value=%%s OR t.value=%%s OR %s) AND t.id=ft.id_tag AND ft.id_field=wf.id_field AND wf.id_idxINDEX=w.id""" if tag[-1] == "%": missing_piece = "t.value LIKE %s" elif tag[-1] != "%" and len(tag) == 3: missing_piece = "t.value LIKE %s" tag3 = tag + "%" #for all tags which start from 'tag' else: missing_piece = "t.value=%s" query = query % missing_piece res = run_sql(query, (tag, tag2, tag3)) if res: if virtual: response = list(res) index_ids = map(str, zip(*res)[0]) query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v, idxINDEX as w WHERE v.id_virtual=w.id AND v.id_normal IN (""" query = query + ", ".join(index_ids) + ")" response.extend(run_sql(query)) return tuple(response) return res return () def get_nonmarc_tag_indexes(nonmarc_tag, virtual=True): """Returns index names and ids corresponding to the given nonmarc tag (nonmarc tag can be also called 'bibfield field'). If param 'virtual' is set to True function will also return virtual indexes""" query = """SELECT DISTINCT w.id, w.name FROM idxINDEX AS w, idxINDEX_field AS wf, field_tag AS ft, tag as t WHERE (t.recjson_value LIKE %s OR t.recjson_value LIKE %s OR t.recjson_value LIKE %s OR t.recjson_value=%s) AND t.id=ft.id_tag AND ft.id_field=wf.id_field AND wf.id_idxINDEX=w.id""" at_the_begining = nonmarc_tag + ',%%' in_the_middle = '%%,' + nonmarc_tag + ',%%' at_the_end = '%%,' + nonmarc_tag res = run_sql(query, (at_the_begining, in_the_middle, at_the_end, nonmarc_tag)) if res: if virtual: response = list(res) index_ids = map(str, zip(*res)[0]) query = """SELECT DISTINCT v.id_virtual,w.name FROM idxINDEX_idxINDEX AS v, idxINDEX as w WHERE v.id_virtual=w.id AND v.id_normal IN (""" query = query + ", ".join(index_ids) + ")" response.extend(run_sql(query)) return tuple(response) return res return () def get_index_tags(indexname, virtual=True, tagtype="marc"): """Returns the list of tags that are indexed inside INDEXNAME. Returns empty list in case there are no tags indexed in this index. Note: uses get_field_tags() defined before. Example: field='author', output=['100__%', '700__%']. 
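A short usage sketch of the two get_field_tags() modes described above; the returned values depend on the installation's tag configuration:

from invenio.legacy.bibindex.engine_utils import get_field_tags

# MARC mode (default): wildcarded MARC tags for the field code,
# e.g. ['100__%', '700__%'] for 'author'
marc_tags = get_field_tags('author')
# nonmarc mode: the comma-separated recjson_value column is split instead,
# yielding a list of bibfield field names
recjson_fields = get_field_tags('author', tagtype='nonmarc')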
@param tagtype: can be: "marc" or "nonmarc", default value is "marc" for backward compatibility """ out = [] query = """SELECT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE w.name=%s AND w.id=wf.id_idxINDEX AND f.id=wf.id_field""" res = run_sql(query, (indexname,)) for row in res: out.extend(get_field_tags(row[0], tagtype)) if not out and virtual: index_id = get_index_id_from_index_name(indexname) try: dependent_indexes = map(str, zip(*get_virtual_index_building_blocks(index_id))[0]) except IndexError: return out tags = set() query = """SELECT DISTINCT f.code FROM idxINDEX AS w, idxINDEX_field AS wf, field AS f WHERE w.id=wf.id_idxINDEX AND f.id=wf.id_field AND w.id IN (""" query = query + ", ".join(dependent_indexes) + ")" res = run_sql(query) for row in res: tags |= set(get_field_tags(row[0], tagtype)) out = list(tags) out = [tag for tag in out if tag] return out def get_min_last_updated(indexes): """Returns min modification date for 'indexes': min(last_updated) @param indexes: list of indexes """ query= """SELECT min(last_updated) FROM idxINDEX WHERE name IN (""" for index in indexes: query += "%s," query = query[:-1] + ")" res = run_sql(query, tuple(indexes)) return res def remove_inexistent_indexes(indexes, leave_virtual=False): """Removes indexes that don't exist from the given list of indexes. @param indexes: list of indexes @param leave_virtual: should we leave virtual indexes in the list? """ correct_indexes = get_all_indexes(leave_virtual) cleaned = [] for index in indexes: if index in correct_indexes: cleaned.append(index) return cleaned def get_records_range_for_index(index_id): """ Get records range for given index. """ try: query = """SELECT min(id_bibrec), max(id_bibrec) FROM idxWORD%02dR""" % index_id resp = run_sql(query) if resp: return resp[0] return None except Exception: return None def make_prefix(index_name): """ Creates a prefix for specific index which is added to every word from this index stored in reversed table of corresponding virtual index. @param index_name: name of the dependent index we want to create prefix for """ return "__" + index_name + "__" class UnknownTokenizer(Exception): pass def list_union(list1, list2): "Returns union of the two lists." union_dict = {} for e in list1: union_dict[e] = 1 for e in list2: union_dict[e] = 1 return union_dict.keys() def get_index_fields(index_id): """Returns fields that are connected to index specified by index_id. """ query = """SELECT f.id, f.name FROM field as f, idxINDEX as w, idxINDEX_field as wf WHERE f.id=wf.id_field AND wf.id_idxINDEX=w.id AND w.id=%s """ index_fields = run_sql(query, (index_id, ) ) return index_fields def recognize_marc_tag(tag): """Checks if tag is a MARC tag or not""" tag_len = len(tag) if 3 <= tag_len <= 6 and tag[0:3].isdigit(): return True if tag_len == 3 and tag[0:2].isdigit() and tag[2] == '%': return True return False def _is_collection(subfield): """Checks if a type is a collection; get_values_recursively internal function.""" return hasattr(subfield, '__iter__') def _get_values(subfield): """Returns values of a subfield suitable for later tokenizing; get_values_recursively internal function.""" if type(subfield) == dict: return subfield.values() else: return subfield def get_values_recursively(subfield, phrases): """Finds all values suitable for later tokenizing in field/subfield of bibfield record. @param subfield: name of the field/subfield @param phrases: container for phrases (for example empty list) FIXME: move this function to bibfield! 
As soon as possible. Note that journal tokenizer also needs to be changed. """ if _is_collection(subfield): for s in _get_values(subfield): get_values_recursively(s, phrases) elif subfield is not None: phrases.append(utf8ifier(subfield)) diff --git a/invenio/legacy/bibindex/engine_washer.py b/invenio/legacy/bibindex/engine_washer.py index ed1484127..2a31f8d94 100644 --- a/invenio/legacy/bibindex/engine_washer.py +++ b/invenio/legacy/bibindex/engine_washer.py @@ -1,169 +1,169 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2009, 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. import re from invenio.legacy.bibindex.engine_stemmer import stem from invenio.legacy.bibindex.engine_stopwords import is_stopword from invenio.config import CFG_BIBINDEX_MIN_WORD_LENGTH, \ CFG_ETCDIR re_pattern_fuzzy_author_dots = re.compile(r'[\.\-]+') re_pattern_fuzzy_author_spaces = re.compile(r'\s+') re_pattern_author_canonical_id = re.compile(r'\.[0-9]+$') re_unicode_lowercase_a = re.compile(unicode(r"(?u)[áàäâãå]", "utf-8")) re_unicode_lowercase_ae = re.compile(unicode(r"(?u)[æ]", "utf-8")) re_unicode_lowercase_e = re.compile(unicode(r"(?u)[éèëê]", "utf-8")) re_unicode_lowercase_i = re.compile(unicode(r"(?u)[íìïî]", "utf-8")) re_unicode_lowercase_o = re.compile(unicode(r"(?u)[óòöôõø]", "utf-8")) re_unicode_lowercase_u = re.compile(unicode(r"(?u)[úùüû]", "utf-8")) re_unicode_lowercase_y = re.compile(unicode(r"(?u)[ýÿ]", "utf-8")) re_unicode_lowercase_c = re.compile(unicode(r"(?u)[çć]", "utf-8")) re_unicode_lowercase_n = re.compile(unicode(r"(?u)[ñ]", "utf-8")) re_unicode_uppercase_a = re.compile(unicode(r"(?u)[ÁÀÄÂÃÅ]", "utf-8")) re_unicode_uppercase_ae = re.compile(unicode(r"(?u)[Æ]", "utf-8")) re_unicode_uppercase_e = re.compile(unicode(r"(?u)[ÉÈËÊ]", "utf-8")) re_unicode_uppercase_i = re.compile(unicode(r"(?u)[ÍÌÏÎ]", "utf-8")) re_unicode_uppercase_o = re.compile(unicode(r"(?u)[ÓÒÖÔÕØ]", "utf-8")) re_unicode_uppercase_u = re.compile(unicode(r"(?u)[ÚÙÜÛ]", "utf-8")) re_unicode_uppercase_y = re.compile(unicode(r"(?u)[Ý]", "utf-8")) re_unicode_uppercase_c = re.compile(unicode(r"(?u)[ÇĆ]", "utf-8")) re_unicode_uppercase_n = re.compile(unicode(r"(?u)[Ñ]", "utf-8")) re_latex_lowercase_a = re.compile("\\\\[\"H'`~^vu=k]\{?a\}?") re_latex_lowercase_ae = re.compile("\\\\ae\\{\\}?") re_latex_lowercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?e\\}?") re_latex_lowercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?i\\}?") re_latex_lowercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?o\\}?") re_latex_lowercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?u\\}?") re_latex_lowercase_y = re.compile("\\\\[\"']\\{?y\\}?") re_latex_lowercase_c = re.compile("\\\\['uc]\\{?c\\}?") re_latex_lowercase_n = re.compile("\\\\[c'~^vu]\\{?n\\}?") re_latex_uppercase_a = 
re.compile("\\\\[\"H'`~^vu=k]\\{?A\\}?") re_latex_uppercase_ae = re.compile("\\\\AE\\{?\\}?") re_latex_uppercase_e = re.compile("\\\\[\"H'`~^vu=k]\\{?E\\}?") re_latex_uppercase_i = re.compile("\\\\[\"H'`~^vu=k]\\{?I\\}?") re_latex_uppercase_o = re.compile("\\\\[\"H'`~^vu=k]\\{?O\\}?") re_latex_uppercase_u = re.compile("\\\\[\"H'`~^vu=k]\\{?U\\}?") re_latex_uppercase_y = re.compile("\\\\[\"']\\{?Y\\}?") re_latex_uppercase_c = re.compile("\\\\['uc]\\{?C\\}?") re_latex_uppercase_n = re.compile("\\\\[c'~^vu]\\{?N\\}?") def lower_index_term(term): """ Return safely lowered index term TERM. This is done by converting to UTF-8 first, because standard Python lower() function is not UTF-8 safe. To be called by both the search engine and the indexer when appropriate (e.g. before stemming). In case of problems with UTF-8 compliance, this function raises UnicodeDecodeError, so the client code may want to catch it. """ return unicode(term, 'utf-8').lower().encode('utf-8') latex_markup_re = re.compile(r"\\begin(\[.+?\])?\{.+?\}|\\end\{.+?}|\\\w+(\[.+?\])?\{(?P.*?)\}|\{\\\w+ (?P.*?)\}") def remove_latex_markup(phrase): ret_phrase = '' index = 0 for match in latex_markup_re.finditer(phrase): ret_phrase += phrase[index:match.start()] ret_phrase += match.group('inside1') or match.group('inside2') or '' index = match.end() ret_phrase += phrase[index:] return ret_phrase def apply_stemming(word, stemming_language): """Returns word after applying stemming (if stemming language is set). You can change your stemming language in database. @param word: word to be checked @type word: str @param stemming_language: abbreviation of language or None @type stemming_language: str """ if stemming_language: word = stem(word, stemming_language) return word def remove_stopwords(word, stopwords_kb=None): """Returns word after stopword check. One must specify the name of the knowledge base. @param word: word to be checked @type word: str @param stopwords_kb: name of the stopwords knowledge base @type word: str """ if stopwords_kb is not None: if is_stopword(word, stopwords_kb): return "" return word def length_check(word): """Returns word after length check. @param word: word to be checked @type word: str """ if len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH: return "" return word def wash_index_term(term, max_char_length=50, lower_term=True): """ Return washed form of the index term TERM that would be suitable for storing into idxWORD* tables. I.e., lower the TERM if LOWER_TERM is True, and truncate it safely to MAX_CHAR_LENGTH UTF-8 characters (meaning, in principle, 4*MAX_CHAR_LENGTH bytes). The function works by an internal conversion of TERM, when needed, from its input Python UTF-8 binary string format into Python Unicode format, and then truncating it safely to the given number of UTF-8 characters, without possible mis-truncation in the middle of a multi-byte UTF-8 character that could otherwise happen if we would have been working with UTF-8 binary representation directly. Note that MAX_CHAR_LENGTH corresponds to the length of the term column in idxINDEX* tables. """ if lower_term: washed_term = unicode(term, 'utf-8').lower() else: washed_term = unicode(term, 'utf-8') if len(washed_term) <= max_char_length: # no need to truncate the term, because it will fit # nicely even if it uses four-byte UTF-8 characters return washed_term.encode('utf-8') else: # truncate the term in a safe position: return washed_term[:max_char_length].encode('utf-8') def wash_author_name(p): """ Wash author name suitable for author searching. 
Notably, replace dots and hyphens with spaces, and collapse spaces. """ if re_pattern_author_canonical_id.search(p): # we have canonical author ID form, so ignore all washing return p out = re_pattern_fuzzy_author_dots.sub(" ", p) out = re_pattern_fuzzy_author_spaces.sub(" ", out) return out.strip() diff --git a/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl b/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl index 903519278..31d66cd72 100644 --- a/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl +++ b/invenio/modules/formatter/templates/format/record/Default_HTML_detailed.tpl @@ -1,56 +1,56 @@ {# # This file is part of Invenio. -# Copyright (C) 2014 CERN. +# Copyright (C) 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. #} {% extends "format/record/Default_HTML_detailed_base.tpl" %} {% block header %} {{ bfe_topbanner(bfo, prefix='
', suffix='

') }} {{ bfe_title(bfo, separator="

") }} {% endblock %} {% block details %} {{ bfe_authors(bfo, suffix="
", limit="25", interactive="yes", print_affiliations="yes", affiliation_prefix=" (", affiliation_suffix=")") }} {{ bfe_addresses(bfo) }} {{ bfe_affiliation(bfo) }} {{ bfe_date(bfo, prefix="
", suffix="
") }} {{ bfe_publisher(bfo, prefix="", suffix="") }} {{ bfe_place(bfo, prefix="", suffix="") }} {{ bfe_isbn(bfo, prefix="
ISBN: ") }} {% endblock %} {% block abstract %} {{ bfe_abstract(bfo, prefix_en="Abstract: ", prefix_fr="Résumé: ", suffix_en="
", suffix_fr="

") }} {{ bfe_keywords(bfo, prefix="
Keyword(s): ", keyword_prefix="", keyword_suffix="") }} {{ bfe_notes(bfo, note_prefix="
Note: ", note_suffix=" ", suffix="
") }} {{ bfe_publi_info(bfo, prefix="

Published in: ") }}
{{ bfe_doi(bfo, prefix="DOI: ", suffix="
") }} {{ bfe_plots(bfo, width="200px", caption="no") }} {% endblock %} {% block footer %} {{ bfe_appears_in_collections(bfo, prefix="

The record appears in these collections:
", suffix="

") }} {# WebTags #} {{ tfn_webtag_record_tags(record.get('recid'), current_user.get_id())|prefix('
') }} {{ tfn_get_back_to_search_links(record.get('recid'))|wrap(prefix='') }} {% endblock %} diff --git a/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py b/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py index 78d5d40ab..9d9e9076c 100644 --- a/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py +++ b/invenio/modules/indexer/testsuite/test_indexer_engine_tokenizer.py @@ -1,369 +1,369 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """bibindex_engine_tokenizer_tests - unit tests for tokenizers There should always be at least one test class for each class in b_e_t. """ from invenio.base.wrappers import lazy_import from invenio.testsuite import make_test_suite, run_test_suite, InvenioTestCase load_tokenizers = lazy_import('invenio.legacy.bibindex.engine_utils:load_tokenizers') _TOKENIZERS = None class TestAuthorTokenizerScanning(InvenioTestCase): """Test BibIndex name tokenization""" def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexAuthorTokenizer"]() self.scan = self.tokenizer.scan_string_for_phrases def test_bifnt_scan_single(self): """BibIndexAuthorTokenizer - scanning single names like 'Dido'""" teststr = "Dido" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Dido'], 'nonlastnames': [], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_simple_western_forward(self): """BibIndexAuthorTokenizer - scanning simple Western-style: first last""" teststr = "Ringo Starr" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_simple_western_reverse(self): """BibIndexAuthorTokenizer - scanning simple Western-style: last, first""" teststr = "Starr, Ringo" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_forward(self): """BibIndexAuthorTokenizer - scanning multiword: first middle last""" teststr = "Michael Edward Peskin" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dotcrammed(self): """BibIndexAuthorTokenizer - scanning multiword: f.m. last""" teststr = "M.E. 
Peskin" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dotcrammed_reversed(self): """BibIndexAuthorTokenizer - scanning multiword: last, f.m.""" teststr = "Peskin, M.E." output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['M', 'E'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dashcrammed(self): """BibIndexAuthorTokenizer - scanning multiword: first-middle last""" teststr = "Jean-Luc Picard" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_multiname_dashcrammed_reversed(self): """BibIndexAuthorTokenizer - scanning multiword: last, first-middle""" teststr = "Picard, Jean-Luc" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Picard'], 'nonlastnames': ['Jean', 'Luc'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_dashes(self): """BibIndexAuthorTokenizer - scanning multiword: first middle last-last""" teststr = "Cantina Octavia Jones-Smith" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_dashes_reverse(self): """BibIndexAuthorTokenizer - scanning multiword: last-last, first middle""" teststr = "Jones-Smith, Cantina Octavia" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Jones', 'Smith'], 'nonlastnames': ['Cantina', 'Octavia'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_compound_lastname_reverse(self): """BibIndexAuthorTokenizer - scanning compound last: last last, first""" teststr = "Alvarez Gaume, Joachim" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': [], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_titled(self): """BibIndexAuthorTokenizer - scanning title-bearing: last, first, title""" teststr = "Epstein, Brian, The Fifth Beatle" output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle'], 'raw' : teststr} self.assertEqual(output, anticipated) def test_bifnt_scan_wildly_interesting(self): """BibIndexAuthorTokenizer - scanning last last last, first first, title, title""" teststr = "Ibanez y Gracia, Maria Luisa, II., ed." 
output = self.scan(teststr) anticipated = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II.', 'ed.'], 'raw' : teststr} self.assertEqual(output, anticipated) class TestAuthorTokenizerTokens(InvenioTestCase): """Test BibIndex name variant token generation from scanned and tagged sets""" def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexAuthorTokenizer"]() self.get_index_tokens = self.tokenizer.parse_scanned_for_phrases def test_bifnt_tokenize_single(self): """BibIndexAuthorTokenizer - tokens for single-word name Ronaldo """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ronaldo'], 'nonlastnames': [], 'titles': [], 'raw' : 'Ronaldo'} output = self.get_index_tokens(tagged_data) anticipated = ['Ronaldo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_simple_forward(self): """BibIndexAuthorTokenizer - tokens for first last Ringo Starr """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : 'Ringo Starr'} output = self.get_index_tokens(tagged_data) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_simple_reverse(self): """BibIndexAuthorTokenizer - tokens for last, first Starr, Ringo """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Starr'], 'nonlastnames': ['Ringo'], 'titles': [], 'raw' : 'Starr, Ringo'} output = self.get_index_tokens(tagged_data) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_twoname_forward(self): """BibIndexAuthorTokenizer - tokens for first middle last Michael Edward Peskin """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Peskin'], 'nonlastnames': ['Michael', 'Edward'], 'titles': [], 'raw' : 'Michael Edward Peskin'} output = self.get_index_tokens(tagged_data) anticipated = ['E Peskin', 'Edward Peskin', 'M E Peskin', 'M Edward Peskin', 'M Peskin', 'Michael E Peskin', 'Michael Edward Peskin', 'Michael Peskin', 'Peskin, E', 'Peskin, Edward', 'Peskin, M', 'Peskin, M E', 'Peskin, M Edward', 'Peskin, Michael', 'Peskin, Michael E', 'Peskin, Michael Edward'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_compound_last(self): """BibIndexAuthorTokenizer - tokens for last last, first Alvarez Gaume, Joachim """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Alvarez', 'Gaume'], 'nonlastnames': ['Joachim'], 'titles': [], 'raw' : 'Alvarez Gaume, Joachim'} output = self.get_index_tokens(tagged_data) anticipated = ['Alvarez Gaume, J', 'Alvarez Gaume, Joachim', 'Alvarez, J', 'Alvarez, Joachim', 'Gaume, J', 'Gaume, Joachim', 'J Alvarez', 'J Alvarez Gaume', 'J Gaume', 'Joachim Alvarez', 'Joachim Alvarez Gaume', 'Joachim Gaume'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_titled(self): """BibIndexAuthorTokenizer - tokens for last, first, title Epstein, Brian, The Fifth Beatle """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Epstein'], 'nonlastnames': ['Brian'], 'titles': ['The Fifth Beatle'], 'raw' : 'Epstein, Brian, The Fifth Beatle'} output = self.get_index_tokens(tagged_data) anticipated = ['B 
Epstein', 'B Epstein, The Fifth Beatle', 'Brian Epstein', 'Brian Epstein, The Fifth Beatle', 'Epstein, B', 'Epstein, B, The Fifth Beatle', 'Epstein, Brian', 'Epstein, Brian, The Fifth Beatle'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_wildly_interesting(self): """BibIndexAuthorTokenizer - tokens for last last last, first first, title, title Ibanez y Gracia, Maria Luisa, II, (ed.) """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Ibanez', 'y', 'Gracia'], 'nonlastnames': ['Maria', 'Luisa'], 'titles': ['II', '(ed.)'], 'raw' : 'Ibanez y Gracia, Maria Luisa, II, (ed.)'} output = self.get_index_tokens(tagged_data) anticipated = ['Gracia, L', 'Gracia, Luisa', 'Gracia, M', 'Gracia, M L', 'Gracia, M Luisa', 'Gracia, Maria', 'Gracia, Maria L', 'Gracia, Maria Luisa', 'Ibanez y Gracia, L', 'Ibanez y Gracia, L, II', 'Ibanez y Gracia, Luisa', 'Ibanez y Gracia, Luisa, II', 'Ibanez y Gracia, M', 'Ibanez y Gracia, M L', 'Ibanez y Gracia, M L, II', 'Ibanez y Gracia, M Luisa', 'Ibanez y Gracia, M Luisa, II', 'Ibanez y Gracia, M, II', 'Ibanez y Gracia, Maria', 'Ibanez y Gracia, Maria L', 'Ibanez y Gracia, Maria L, II', 'Ibanez y Gracia, Maria Luisa', 'Ibanez y Gracia, Maria Luisa, II', 'Ibanez y Gracia, Maria, II', 'Ibanez, L', 'Ibanez, Luisa', 'Ibanez, M', 'Ibanez, M L', 'Ibanez, M Luisa', 'Ibanez, Maria', 'Ibanez, Maria L', 'Ibanez, Maria Luisa', 'L Gracia', 'L Ibanez', 'L Ibanez y Gracia', 'L Ibanez y Gracia, II', 'Luisa Gracia', 'Luisa Ibanez', 'Luisa Ibanez y Gracia', 'Luisa Ibanez y Gracia, II', 'M Gracia', 'M Ibanez', 'M Ibanez y Gracia', 'M Ibanez y Gracia, II', 'M L Gracia', 'M L Ibanez', 'M L Ibanez y Gracia', 'M L Ibanez y Gracia, II', 'M Luisa Gracia', 'M Luisa Ibanez', 'M Luisa Ibanez y Gracia', 'M Luisa Ibanez y Gracia, II', 'Maria Gracia', 'Maria Ibanez', 'Maria Ibanez y Gracia', 'Maria Ibanez y Gracia, II', 'Maria L Gracia', 'Maria L Ibanez', 'Maria L Ibanez y Gracia', 'Maria L Ibanez y Gracia, II', 'Maria Luisa Gracia', 'Maria Luisa Ibanez', 'Maria Luisa Ibanez y Gracia', 'Maria Luisa Ibanez y Gracia, II'] self.assertEqual(output, anticipated) def test_bifnt_tokenize_multimiddle_forward(self): """BibIndexAuthorTokenizer - tokens for first middle middle last W K H Panofsky """ tagged_data = {'TOKEN_TAG_LIST': ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames': ['Panofsky'], 'nonlastnames': ['W', 'K', 'H'], 'titles': [], 'raw' : 'W K H Panofsky'} output = self.get_index_tokens(tagged_data) anticipated = ['H Panofsky', 'K H Panofsky', 'K Panofsky', 'Panofsky, H', 'Panofsky, K', 'Panofsky, K H', 'Panofsky, W', 'Panofsky, W H', 'Panofsky, W K', 'Panofsky, W K H', 'W H Panofsky', 'W K H Panofsky', 'W K Panofsky', 'W Panofsky'] self.assertEqual(output, anticipated) def test_tokenize(self): """BibIndexAuthorTokenizer - check tokenize_for_phrases() Ringo Starr """ teststr = "Ringo Starr" output = self.tokenizer.tokenize_for_phrases(teststr) anticipated = ['R Starr', 'Ringo Starr', 'Starr, R', 'Starr, Ringo'] self.assertEqual(output, anticipated) class TestExactAuthorTokenizer(InvenioTestCase): """Test exact author name tokenizer.""" def setUp(self): """setup""" _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexExactAuthorTokenizer"]() self.tokenize = self.tokenizer.tokenize_for_phrases def test_exact_author_name_tokenizer_bare(self): """BibIndexExactNameTokenizer - bare name""" self.assertEqual(self.tokenize('John Doe'), ['John Doe']) def test_exact_author_name_tokenizer_dots(self): 
"""BibIndexExactNameTokenizer - name with dots""" self.assertEqual(self.tokenize('J. Doe'), ['J Doe']) self.assertEqual(self.tokenize('J.R. Doe'), ['J R Doe']) self.assertEqual(self.tokenize('J. R. Doe'), ['J R Doe']) def test_exact_author_name_tokenizer_trailing_dots(self): """BibIndexExactNameTokenizer - name with trailing dots""" self.assertEqual(self.tokenize('Doe, J'), ['Doe, J']) self.assertEqual(self.tokenize('Doe, J.'), ['Doe, J']) def test_exact_author_name_tokenizer_hyphens(self): """BibIndexExactNameTokenizer - name with hyphens""" self.assertEqual(self.tokenize('Doe, Jean-Pierre'), ['Doe, Jean Pierre']) class TestCJKTokenizer(InvenioTestCase): """Tests for CJK Tokenizer which splits CJK words into characters and treats every single character as a word""" @classmethod def setUp(self): _TOKENIZERS = load_tokenizers() self.tokenizer = _TOKENIZERS["BibIndexCJKTokenizer"]() def test_tokenize_for_words_phrase_galaxy(self): """tokenizing phrase: galaxy s4据信""" phrase = "galaxy s4据信" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['galaxy','s4','据','信']), sorted(result)) def test_tokenize_for_words_phrase_with_special_punctuation(self): """tokenizing phrase: 马英九:台湾民""" phrase = u"马英九:台湾民" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['马','英','九','台','湾','民']), sorted(result)) def test_tokenize_for_words_phrase_with_special_punctuation_two(self): """tokenizing phrase: 色的“刀子嘴”""" phrase = u"色的“刀子嘴”" result = self.tokenizer.tokenize_for_words(phrase) self.assertEqual(sorted(['色','的','刀','子','嘴']), sorted(result)) def test_tokenize_for_words_simple_phrase(self): """tokenizing phrase: 春眠暁覚""" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(u'春眠暁覚')), sorted(['春', '眠', '暁', '覚'])) def test_tokenize_for_words_mixed_phrase(self): """tokenizing phrase: 春眠暁ABC覚""" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(u'春眠暁ABC覚')), sorted(['春', '眠', '暁', 'abc', '覚'])) def test_tokenize_for_words_phrase_with_comma(self): """tokenizing phrase: 春眠暁, 暁""" phrase = u"春眠暁, 暁" self.assertEqual(sorted(self.tokenizer.tokenize_for_words(phrase)), sorted(['春','眠','暁'])) TEST_SUITE = make_test_suite(TestAuthorTokenizerScanning, TestAuthorTokenizerTokens, TestExactAuthorTokenizer, TestCJKTokenizer) if __name__ == '__main__': run_test_suite(TEST_SUITE) diff --git a/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py index 0cd14d179..09595476d 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexAuthorCountTokenizer.py @@ -1,62 +1,62 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexAuthorCountTokenizer: counts number of authors for any publication given by recID. Will look at tags: '100_a' and '700_a' which are: 'first author name' and 'additional author name'. """ from invenio.legacy.bibindex.engine_utils import get_field_count from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer from invenio.modules.records.api import get_record class BibIndexAuthorCountTokenizer(BibIndexMultiFieldTokenizer): """ Returns a number of authors who created a publication with given recID in the database. Takes recID of the record as an argument to tokenizing function. Calculates terms based on information from multiple tags. For more information on this type of tokenizers take a look on BibIndexAuthorCountTokenizer base class. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.tags = ['100__a', '700__a'] self.nonmarc_tag = 'number_of_authors' def tokenize(self, recID): """Uses get_field_count from bibindex.engine_utils for finding a number of authors of a publication and pass it in the list""" return [str(get_field_count(recID, self.tags)),] def tokenize_via_recjson(self, recID): """ Will tokenize with use of bibfield. @param recID: id of the record """ rec = get_record(recID) return [str(rec.get(self.nonmarc_tag) or 0)] def get_tokenizing_function(self, wordtable_type): return self.tokenize def get_nonmarc_tokenizing_function(self, table_type): return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py index b7011fb67..c990e42aa 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexAuthorTokenizer.py @@ -1,336 +1,336 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexAuthorTokenizer: tokenizer introduced for author index. It tokenizes author name in a fuzzy way. Creates different variants of an author name. For example: John Cleese will be tokenized into: 'C John', 'Cleese John', 'John, C', 'John, Cleese' """ import re from invenio.config import CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexAuthorTokenizer(BibIndexDefaultTokenizer): """Human name tokenizer. 
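A usage sketch for the BibIndexAuthorCountTokenizer shown above; the record ID and field counts are illustrative, and both paths need a populated record store:

from invenio.modules.indexer.tokenizers.BibIndexAuthorCountTokenizer import \
    BibIndexAuthorCountTokenizer

tokenizer = BibIndexAuthorCountTokenizer()
# MARC path: get_field_count() sums the 100__a and 700__a instances,
# so one first author plus two additional authors yields ['3']
print tokenizer.tokenize(42)
# bibfield path: reads the record's precomputed 'number_of_authors' field
print tokenizer.tokenize_via_recjson(42)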
Human names are divided into three classes of tokens: 'lastnames', i.e., family, tribal or group identifiers, 'nonlastnames', i.e., personal names distinguishing individuals, 'titles', both incidental and permanent, e.g., 'VIII', '(ed.)', 'Msc' """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) self.single_initial_re = re.compile('^\w\.$') self.split_on_re = re.compile('[\.\s-]') # lastname_stopwords describes terms which should not be used for indexing, # in multiple-word last names. These are purely conjunctions, serving the # same function as the American hyphen, but using linguistic constructs. self.lastname_stopwords = set(['y', 'of', 'and', 'de']) def scan_string_for_phrases(self, s): """Scan a name string and output an object representing its structure. @param s: the input to be lexically tagged @type s: string @return: dict of lexically tagged input items. Sample output for the name 'Jingleheimer Schmitt, John Jacob, XVI.' is: { 'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames' : ['Jingleheimer', 'Schmitt'], 'nonlastnames' : ['John', 'Jacob'], 'titles' : ['XVI.'], 'raw' : 'Jingleheimer Schmitt, John Jacob, XVI.' } @rtype: dict """ retval = {'TOKEN_TAG_LIST' : ['lastnames', 'nonlastnames', 'titles', 'raw'], 'lastnames' : [], 'nonlastnames' : [], 'titles' : [], 'raw' : s} l = s.split(',') if len(l) < 2: # No commas means a simple name new = s.strip() new = s.split(' ') if len(new) == 1: retval['lastnames'] = new # rare single-name case else: retval['lastnames'] = new[-1:] retval['nonlastnames'] = new[:-1] for tag in ['lastnames', 'nonlastnames']: retval[tag] = [x.strip() for x in retval[tag]] retval[tag] = [re.split(self.split_on_re, x) for x in retval[tag]] # flatten sublists retval[tag] = [item for sublist in retval[tag] for item in sublist] retval[tag] = [x for x in retval[tag] if x != ''] else: # Handle lastname-first multiple-names case retval['titles'] = l[2:] # no titles? no problem retval['nonlastnames'] = l[1] retval['lastnames'] = l[0] for tag in ['lastnames', 'nonlastnames']: retval[tag] = retval[tag].strip() retval[tag] = re.split(self.split_on_re, retval[tag]) # filter empty strings retval[tag] = [x for x in retval[tag] if x != ''] retval['titles'] = [x.strip() for x in retval['titles'] if x != ''] return retval def parse_scanned_for_phrases(self, scanned): """Return all the indexable variations for a tagged token dictionary. Does this via the combinatoric expansion of the following rules: - Expands first names as name, first initial with period, first initial without period. - Expands compound last names as each of their non-stopword subparts. - Titles are treated literally, but applied serially. Please note that titles will be applied to complete last names only. So for example, if there is a compound last name of the form, "Ibanez y Gracia", with the title, "(ed.)", then only the combination of those two strings will do, not "Ibanez" and not "Gracia". @param scanned: lexically tagged input items in the form of the output from scan() @type scanned: dict @return: combinatorically expanded list of strings for indexing @rtype: list of string """ def _fully_expanded_last_name(first, lastlist, title = None): """Return a list of all of the first / last / title combinations. 
@param first: one possible non-last name @type first: string @param lastlist: the strings of the tokens in the (possibly compound) last name @type lastlist: list of string @param title: one possible title @type title: string """ retval = [] title_word = '' if title != None: title_word = ', ' + title last = ' '.join(lastlist) retval.append(first + ' ' + last + title_word) retval.append(last + ', ' + first + title_word) for last in lastlist: if last in self.lastname_stopwords: continue retval.append(first + ' ' + last + title_word) retval.append(last + ', ' + first + title_word) return retval last_parts = scanned['lastnames'] first_parts = scanned['nonlastnames'] titles = scanned['titles'] raw = scanned['raw'] if len(first_parts) == 0: # rare single-name case return scanned['lastnames'] expanded = [] for exp in self.__expand_nonlastnames(first_parts): expanded.extend(_fully_expanded_last_name(exp, last_parts, None)) for title in titles: # Drop titles which are parenthesized. This eliminates (ed.) from the index, but # leaves XI, for example. This gets rid of the surprising behavior that searching # for 'author:ed' retrieves people who have been editors, but whose names aren't # Ed. # TODO: Make editorship and other special statuses a MARC field. if title.find('(') != -1: continue # XXX: remember to document that titles can only be applied to complete last names expanded.extend(_fully_expanded_last_name(exp, [' '.join(last_parts)], title)) return sorted(list(set(expanded))) def __expand_nonlastnames(self, namelist): """Generate every expansion of a series of human non-last names. Example: "Michael Edward" -> "Michael Edward", "Michael E.", "Michael E", "M. Edward", "M Edward", "M. E.", "M. E", "M E.", "M E", "M.E." ...but never: "ME" @param namelist: a collection of names @type namelist: list of string @return: a greatly expanded collection of names @rtype: list of string """ def _expand_name(name): """Lists [name, initial, empty]""" if name == None: return [] return [name, name[0]] def _pair_items(head, tail): """Lists every combination of head with each and all of tail""" if len(tail) == 0: return [head] l = [] l.extend([head + ' ' + tail[0]]) #l.extend([head + '-' + tail[0]]) l.extend(_pair_items(head, tail[1:])) return l def _collect(head, tail): """Brings together combinations of things""" def _cons(a, l): l2 = l[:] l2.insert(0, a) return l2 if len(tail) == 0: return [head] l = [] l.extend(_pair_items(head, _expand_name(tail[0]))) l.extend([' '.join(_cons(head, tail)).strip()]) #l.extend(['-'.join(_cons(head, tail)).strip()]) l.extend(_collect(head, tail[1:])) return l def _expand_contract(namelist): """Runs collect with every head in namelist and its tail""" val = [] for i in range(len(namelist)): name = namelist[i] for expansion in _expand_name(name): val.extend(_collect(expansion, namelist[i+1:])) return val def _add_squashed(namelist): """Finds cases like 'M. E.' and adds 'M.E.'""" val = namelist def __check_parts(parts): if len(parts) < 2: return False for part in parts: if not self.single_initial_re.match(part): return False return True for name in namelist: parts = name.split(' ') if not __check_parts(parts): continue val.extend([''.join(parts)]) return val return _add_squashed(_expand_contract(namelist)) def tokenize_for_fuzzy_authors(self, phrase): """Output the list of strings expanding phrase. Does this via the combinatoric expansion of the following rules: - Expands first names as name, first initial with period, first initial without period. 
- Expands compound last names as each of their non-stopword subparts. - Titles are treated literally, but applied serially. Please note that titles will be applied to complete last names only. So for example, if there is a compound last name of the form, "Ibanez y Gracia", with the title, "(ed.)", then only the combination of those two strings will do, not "Ibanez" and not "Gracia". Old: BibIndexFuzzyAuthorTokenizer @param phrase: the input to be lexically tagged @type phrase: string @return: combinatorically expanded list of strings for indexing @rtype: list of string @note: A simple wrapper around scan and parse_scanned. """ return self.parse_scanned_for_phrases(self.scan_string_for_phrases(phrase)) def tokenize_for_phrases(self, phrase): """ Another name for tokenize_for_fuzzy_authors. It's for the compatibility. See: tokenize_for_fuzzy_authors """ return self.tokenize_for_fuzzy_authors(phrase) def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexAuthorTokenizer, self).tokenize_for_words(phrase) def get_author_family_name_words_from_phrase(self, phrase): """ Return list of words from author family names, not his/her first names. The phrase is assumed to be the full author name. This is useful for CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES. @param phrase: phrase to get family name from """ d_family_names = {} # first, treat everything before first comma as surname: if ',' in phrase: d_family_names[phrase.split(',', 1)[0]] = 1 # second, try fuzzy author tokenizer to find surname variants: for name in self.tokenize_for_phrases(phrase): if ',' in name: d_family_names[name.split(',', 1)[0]] = 1 # now extract words from these surnames: d_family_names_words = {} for family_name in d_family_names.keys(): for word in self.tokenize_for_words_default(family_name): d_family_names_words[word] = 1 return d_family_names_words.keys() def tokenize_for_words(self, phrase): """ If CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES is 1 we tokenize only for family names. In other case we perform standard tokenization for words. """ if CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES: return self.get_author_family_name_words_from_phrase(phrase) else: return self.tokenize_for_words_default(phrase) diff --git a/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py index c3908cdff..79c3a8c37 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexCJKTokenizer.py @@ -1,133 +1,133 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""BibIndexCJKTokenizer: makes search in collections with CJK papers and publications more reliable If phrase has characters from CJK language set tokenizer will treat it diffrently than phrase without these chars. CJK Tokenizer splits CJK words into single characters (it adds space between every two CJK characters). """ import re from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer is_character_from_CJK_set = re.compile(u'[\u3400-\u4DBF\u4E00-\u9FFF]') special_CJK_punctuation = re.compile(u'[\uff1a,\uff0c,\u3001,\u3002,\u201c,\u201d]') def is_from_CJK_set_single_character_match(char): if not isinstance(char, unicode): char = char.decode("utf8") res = is_character_from_CJK_set.match(char) if res: return True return False def is_from_CJK_set_full_match(text): if not isinstance(text, unicode): text = text.decode("utf8") res = is_character_from_CJK_set.findall(text) if len(res) == len(text): return True return False def is_there_any_CJK_character_in_text(text): if not isinstance(text, unicode): text = text.decode("utf8") res = is_character_from_CJK_set.search(text) if res is not None: return True return False def is_non_CJK_expression(word): return not is_there_any_CJK_character_in_text(word) class BibIndexCJKTokenizer(BibIndexDefaultTokenizer): """A phrase is split into CJK characters. CJK is Chinese, Japanese and Korean unified character set. It means that for example, phrase: '据信,新手机更轻' will be split into: ['据', '信', '新', '手', '机', '更', '轻']""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """Initialisation""" BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexCJKTokenizer, self).tokenize_for_words(phrase) def tokenize_for_words(self, phrase): """ Splits phrase into words with additional spaces between CJK characters to enhance search for CJK papers and stuff. If there is no single CJK character in whole phrase it behaves the standard way: it splits phrase into words with use of BibIndexDefaultTokenizer's tokenize_for_words. 
@param phrase: CJK phrase to be tokenized @type phrase: string @return: list of CJK characters and non-CJK words @rtype: list of string """ if is_there_any_CJK_character_in_text(phrase): #remove special CJK punctuation phrase = special_CJK_punctuation.sub("", phrase) #first, we split our phrase with default word tokenizer to make it easier later pre_tokenized = self.tokenize_for_words_default(phrase) #list for keeping CJK chars and non-CJK words chars = [] #every CJK word splits into a set of single characters #for example: "春眠暁覚" into ['春','眠','暁','覚'] words = [ word.decode("utf8") for word in pre_tokenized] for word in words: if is_from_CJK_set_full_match(word): chars.extend(word) else: non_chinese = u"" for char in word: if is_from_CJK_set_single_character_match(char): if non_chinese: chars.append(non_chinese) non_chinese = u"" chars.append(char) else: non_chinese = non_chinese + char if non_chinese: chars.append(non_chinese) clean_dict = {} for c in chars: clean_dict[c] = 1 chars = [c.encode("utf8") for c in clean_dict.keys()] return chars else: return self.tokenize_for_words_default(phrase) def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py index 3344d9b48..b939e69ec 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexDOITokenizer.py @@ -1,53 +1,53 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2013 CERN. +# Copyright (C) 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. from invenio.modules.indexer.tokenizers.BibIndexFilteringTokenizer import BibIndexFilteringTokenizer class BibIndexDOITokenizer(BibIndexFilteringTokenizer): """ Filtering tokenizer which tokenizes DOI tag (0247_a) only if "0247_2" tag is present and its value equals "DOI" and 909C4a tag without any constraints. """ def __init__(self, stemming_language=None, remove_stopwords=False, remove_html_markup=False, remove_latex_markup=False): self.rules = (('0247_a', '2', 'DOI'), ('909C4a', '', '')) def get_tokenizing_function(self, wordtable_type): """Returns proper tokenizing function""" return self.tokenize def tokenize_via_recjson(self, recID): """ Nonmarc version of tokenize function for DOI. Note: with nonmarc we don't need to filter anymore. We just need to take value from record because we use bibfield here. """ rec = get_record(recID) values = rec.get('doi', []) return values def get_nonmarc_tokenizing_function(self, table_type): """ Returns proper tokenizing function for non-marc records. 
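Editor's note: the tokenize_for_words() implementation above splits any phrase containing CJK characters into single CJK characters while keeping non-CJK runs intact. The following standalone sketch reuses the same Unicode ranges but omits the special-punctuation stripping and duplicate removal of the real method.

# -*- coding: utf-8 -*-
# Illustrative sketch of the CJK splitting rule used by
# BibIndexCJKTokenizer.tokenize_for_words(): every CJK character becomes a
# token of its own, while runs of non-CJK characters stay together.
import re

CJK_CHAR = re.compile(u'[\u3400-\u4DBF\u4E00-\u9FFF]')  # same ranges as above

def split_cjk(phrase):
    """Split unicode PHRASE into single CJK characters and non-CJK words."""
    tokens = []
    for word in phrase.split():          # whitespace split first, as the
        buf = u''                        # default word tokenizer would do
        for char in word:
            if CJK_CHAR.match(char):
                if buf:                  # flush any pending non-CJK run
                    tokens.append(buf)
                    buf = u''
                tokens.append(char)      # one token per CJK character
            else:
                buf += char
        if buf:
            tokens.append(buf)
    return tokens

if __name__ == '__main__':
    for token in split_cjk(u'春眠暁覚 morning sleep'):
        print(token)                     # 春 眠 暁 覚 morning sleep, one per line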
""" return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py index cc8bdae84..530f0b93e 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexDefaultTokenizer.py @@ -1,173 +1,173 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexDefaultTokenizer: useful for almost all indexes. It performs standard tokenization. It splits phrases into words/pairs or doesnt split at all, strips accents, removes alphanumeric characters and html and latex markup if we want to. Also can stem words for a given language. """ from invenio.legacy.bibindex.engine_config import \ CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.utils.html import remove_html_markup from invenio.utils.text import wash_for_utf8, strip_accents from invenio.legacy.bibindex.engine_washer import \ lower_index_term, remove_latex_markup, \ apply_stemming, remove_stopwords, length_check from invenio.legacy.bibindex.engine_utils import latex_formula_re, \ re_block_punctuation_begin, \ re_block_punctuation_end, \ re_punctuation, \ re_separators, \ re_arxiv from invenio.modules.indexer.tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer class BibIndexDefaultTokenizer(BibIndexStringTokenizer): """ It's a standard tokenizer. It is useful for most of the indexes. Its behaviour depends on stemming, remove stopwords, remove html markup and remove latex markup parameters. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """initialization""" self.stemming_language = stemming_language self.remove_stopwords = remove_stopwords self.remove_html_markup = remove_html_markup self.remove_latex_markup = remove_latex_markup def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" if wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]: return self.tokenize_for_words elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]: return self.tokenize_for_pairs elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]: return self.tokenize_for_phrases def tokenize_for_words(self, phrase): """Return list of words found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. 
""" words = {} formulas = [] if self.remove_html_markup and phrase.find(" -1: phrase = remove_html_markup(phrase) if self.remove_latex_markup: formulas = latex_formula_re.findall(phrase) phrase = remove_latex_markup(phrase) phrase = latex_formula_re.sub(' ', phrase) phrase = wash_for_utf8(phrase) phrase = lower_index_term(phrase) # 1st split phrase into blocks according to whitespace for block in strip_accents(phrase).split(): # 2nd remove leading/trailing punctuation and add block: block = re_block_punctuation_begin.sub("", block) block = re_block_punctuation_end.sub("", block) if block: stemmed_block = remove_stopwords(block, self.remove_stopwords) stemmed_block = length_check(stemmed_block) stemmed_block = apply_stemming(stemmed_block, self.stemming_language) if stemmed_block: words[stemmed_block] = 1 if re_arxiv.match(block): # special case for blocks like `arXiv:1007.5048' where # we would like to index the part after the colon # regardless of dot or other punctuation characters: words[block.split(':', 1)[1]] = 1 # 3rd break each block into subblocks according to punctuation and add subblocks: for subblock in re_punctuation.split(block): stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords) stemmed_subblock = length_check(stemmed_subblock) stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language) if stemmed_subblock: words[stemmed_subblock] = 1 # 4th break each subblock into alphanumeric groups and add groups: for alphanumeric_group in re_separators.split(subblock): stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords) stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group) stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language) if stemmed_alphanumeric_group: words[stemmed_alphanumeric_group] = 1 for block in formulas: words[block] = 1 return words.keys() def tokenize_for_pairs(self, phrase): """Return list of words found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. 
""" words = {} if self.remove_html_markup and phrase.find(" -1: phrase = remove_html_markup(phrase) if self.remove_latex_markup: phrase = remove_latex_markup(phrase) phrase = latex_formula_re.sub(' ', phrase) phrase = wash_for_utf8(phrase) phrase = lower_index_term(phrase) # 1st split phrase into blocks according to whitespace last_word = '' for block in strip_accents(phrase).split(): # 2nd remove leading/trailing punctuation and add block: block = re_block_punctuation_begin.sub("", block) block = re_block_punctuation_end.sub("", block) if block: block = remove_stopwords(block, self.remove_stopwords) block = length_check(block) block = apply_stemming(block, self.stemming_language) # 3rd break each block into subblocks according to punctuation and add subblocks: for subblock in re_punctuation.split(block): subblock = remove_stopwords(subblock, self.remove_stopwords) subblock = length_check(subblock) subblock = apply_stemming(subblock, self.stemming_language) if subblock: # 4th break each subblock into alphanumeric groups and add groups: for alphanumeric_group in re_separators.split(subblock): alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords) alphanumeric_group = length_check(alphanumeric_group) alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language) if alphanumeric_group: if last_word: words['%s %s' % (last_word, alphanumeric_group)] = 1 last_word = alphanumeric_group return words.keys() def tokenize_for_phrases(self, phrase): """Return list of phrases found in PHRASE. Note that the phrase is split into groups depending on the alphanumeric characters and punctuation characters definition present in the config file. """ phrase = wash_for_utf8(phrase) return [phrase] def get_nonmarc_tokenizing_function(self, table_type): """ Picks correct tokenize_for_xxx function depending on the type of tokenization for non-marc standards. """ return self.get_tokenizing_function(table_type) diff --git a/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py index 4e87d3c50..1508a6bab 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexEmptyTokenizer.py @@ -1,73 +1,73 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexEmptyTokenizer. It's a really lazy tokenizer and doesn't do anything. """ from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.modules.indexer.tokenizers.BibIndexStringTokenizer import BibIndexStringTokenizer class BibIndexEmptyTokenizer(BibIndexStringTokenizer): """ BibIndexEmptyTokenizer doesn't do anything. 
Irrespective of input to tokenizing function it always returns empty list. Can be used in some default cases or when we want to turn off specific index. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" if wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Words"]: return self.tokenize_for_words elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Pairs"]: return self.tokenize_for_pairs elif wordtable_type == CFG_BIBINDEX_INDEX_TABLE_TYPE["Phrases"]: return self.tokenize_for_phrases def get_nonmarc_tokenizing_function(self, table_type): """ Picks correct tokenize_for_xxx function depending on the type of tokenization for non-marc standards. """ return self.get_tokenizing_function(table_type) def tokenize_for_words(self, phrase): return [] def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py index e111b5292..2c7f0a851 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexExactAuthorTokenizer.py @@ -1,44 +1,44 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexExactAuthorTokenizer: performs only washing on author name and leaves it alone in the same form. """ from invenio.legacy.bibindex.engine_washer import wash_author_name from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexExactAuthorTokenizer(BibIndexDefaultTokenizer): """ Human name exact tokenizer. Old: BibIndexExactNameTokenizer """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def tokenize_for_phrases(self, s): """ Returns washed autor name. 
""" return [wash_author_name(s)] diff --git a/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py index 313a95a87..9aa52d3c4 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFilenameTokenizer.py @@ -1,69 +1,69 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFilenameTokenizer: 'tokenizes' finds file names. Tokenizer is adapted to work with bibfield and its get_record function. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexFilenameTokenizer(BibIndexRecJsonTokenizer): """ Tokenizes for file names. Tokenizer is adapted to work with bibfield and its get_record function. It accepts as an input a record created by a get_record function: from bibfield import get_record record16 = get_record(16) tokenizer = BibIndexFilenameTokenizer() new_words = tokenizer.tokenize(record16) Example of new_words: 'thesis.ps.gz' -> ['thesis', 'thesis.ps', 'thesis.ps.gz'] """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record from bibfield. Function uses derived field 'filenames' from the record. @param urls: recjson record """ values = [] try: if 'filenames' in record: values = record['filenames'] except KeyError: pass except TypeError: return [] return values def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py index c5b17b7ad..388d3ddd8 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFiletypeTokenizer.py @@ -1,71 +1,71 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2013, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFiletypeTokenizer: 'tokenizes' for file extensions. Tokenizer is adapted to work with recjson and its get_record function. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexFiletypeTokenizer(BibIndexRecJsonTokenizer): """ Tokenizes for file extensions. Tokenizer is adapted to work with recjson and its get_record function. It accepts as an input a record created by a get_record function: from invenio.modules.records.api import get_record record16 = get_record(16) tokenizer = BibIndexFiletypeTokenizer() new_words = tokenizer.tokenize(record16) """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record. Function uses derived field 'filetypes' from the record. @param urls: recjson record """ values = [] try: if 'filetypes' in record: values = record['filetypes'] except KeyError: pass except TypeError: return [] return values def tokenize_for_words(self, record): return self.tokenize(record) def tokenize_for_pairs(self, record): return self.tokenize(record) def tokenize_for_phrases(self, record): return self.tokenize(record) def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py index 1e7afebe0..18704c32c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFilteringTokenizer.py @@ -1,85 +1,85 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. +# Copyright (C) 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. from invenio.legacy.search_engine import get_record from invenio.legacy.bibrecord import record_get_field_instances from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer class BibIndexFilteringTokenizer(BibIndexMultiFieldTokenizer): """ This tokenizer would tokenize phrases from tag only if another tag was present in the record's metadata, for example it would tokenize phrases from 100__a only if 100__u was found in the record's metadata. This tokenizer is abstract and it shouldn't be used for indexes. Insted of using this tokenizer one can create another tokenizer iheriting after this one. To create new tokenizer based on BibIndexFilteringTokenizer you need to specify rules of tokenizing in self.rules property. 
Examples: 1) Let's say we want to tokenize data only from 100__a if 100__u is present: set: self.rules = (('100__a', 'u', ''),) 2) We want to tokenize data from '0247_a' if '0247_2' == 'DOI': set: self.rules = (('0247_2', '2', 'DOI'),) 3) We want to tokenize data from '0247_a' if '0247_2' == 'DOI' and all data from '100__a' with no constraints: set: self.rules = (('0247_2', '2', 'DOI'), ('100__a', '', '')) Definition of 'rules' tuple: (tag_to_take_phrases_from, value_of_sub_tag or '', necessary_value_of_sub_tag or '') Note: there is no get_tokenizing_function() to make this tokenizer abstract. """ def __init__(self, stemming_language=None, remove_stopwords=False, remove_html_markup=False, remove_latex_markup=False): self.rules = () def tokenize(self, recID): phrases = [] try: rec = get_record(recID) for rule in self.rules: tag_to_index, necessary_tag, necessary_value = rule core_tag = tag_to_index[0:3] ind = tag_to_index[3:5] sub_tag = tag_to_index[5] fields = [dict(instance[0]) for instance in record_get_field_instances(rec, core_tag, ind[0], ind[1])] for field in fields: tag_condition = necessary_tag and field.has_key(necessary_tag) or necessary_tag == '' value_condition = necessary_value and field.get(necessary_tag, '') == necessary_value or \ necessary_value == '' if tag_condition and field.has_key(sub_tag) and value_condition: phrases.append(field[sub_tag]) return phrases except KeyError: return [] return phrases def tokenize_via_recjson(self, recID): """ TODO: implementation needs to be introduced in order to work with non-marc standards. """ return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py index 2159e3163..86ff9526d 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexFulltextTokenizer.py @@ -1,194 +1,194 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexFulltextTokenizer: extracts words form a given document. Document is given by its URL. 
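Editor's note: the rules tuples described above drive BibIndexFilteringTokenizer.tokenize(). The sketch below applies the same (tag_to_index, necessary_subfield, necessary_value) logic to field instances represented as plain dicts rather than bibrecord structures; the record layout and helper name are illustrative, and the rule shown mirrors the DOI rule used by BibIndexDOITokenizer earlier in this patch.

# Illustrative sketch of the self.rules filtering logic described above,
# applied to field instances given as dicts of subfield -> value.

def filter_phrases(field_instances, rules):
    """Collect subfield values according to RULES.

    Each rule is (tag_to_index, necessary_subfield, necessary_value); an empty
    necessary_subfield/value means "no constraint".
    """
    phrases = []
    for tag_to_index, necessary_tag, necessary_value in rules:
        tag, sub_tag = tag_to_index[:5], tag_to_index[5]
        for field in field_instances.get(tag, []):
            tag_ok = necessary_tag == '' or necessary_tag in field
            value_ok = necessary_value == '' or field.get(necessary_tag) == necessary_value
            if tag_ok and value_ok and sub_tag in field:
                phrases.append(field[sub_tag])
    return phrases

if __name__ == '__main__':
    record_fields = {
        '0247_': [{'2': 'DOI', 'a': '10.1234/example.doi'},      # made-up values
                  {'2': 'ISBN', 'a': '978-0-00-000000-0'}],
    }
    # take 0247_a only when 0247_2 == 'DOI', as in the DOI tokenizer:
    print(filter_phrases(record_fields, (('0247_a', '2', 'DOI'),)))
    # ['10.1234/example.doi']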
""" import os import sys import logging import urllib2 import re from six import iteritems from invenio.config import \ CFG_SOLR_URL, \ CFG_XAPIAN_ENABLED, \ CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY, \ CFG_BIBINDEX_SPLASH_PAGES from invenio.utils.html import get_links_in_html_page from invenio.legacy.websubmit.file_converter import convert_file, get_file_converter_logger from invenio.legacy.miscutil.solrutils_bibindex_indexer import solr_add_fulltext from invenio.legacy.miscutil.xapianutils_bibindex_indexer import xapian_add from invenio.legacy.bibdocfile.api import bibdocfile_url_p, \ bibdocfile_url_to_bibdoc, download_url, \ BibRecDocs, InvenioBibDocFileError from invenio.legacy.bibindex.engine_utils import get_idx_indexer from invenio.legacy.bibsched.bibtask import write_message from invenio.ext.logging import register_exception from intbitset import intbitset from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer fulltext_added = intbitset() # stores ids of records whose fulltexts have been added class BibIndexFulltextTokenizer(BibIndexDefaultTokenizer): """ Exctracts all the words contained in document specified by url. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.verbose = 3 BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def set_verbose(self, verbose): """Allows to change verbosity level during indexing""" self.verbose = verbose def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexFulltextTokenizer, self).tokenize_for_words(phrase) def get_words_from_fulltext(self, url_direct_or_indirect): """Returns all the words contained in the document specified by URL_DIRECT_OR_INDIRECT with the words being split by various SRE_SEPARATORS regexp set earlier. If FORCE_FILE_EXTENSION is set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF file. (This is interesting to index Indico for example.) Note also that URL_DIRECT_OR_INDIRECT may be either a direct URL to the fulltext file or an URL to a setlink-like page body that presents the links to be indexed. In the latter case the URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs to fulltext documents, for all knows file extensions as specified by global CONV_PROGRAMS config variable. """ write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2) try: if bibdocfile_url_p(url_direct_or_indirect): write_message("... 
%s is an internal document" % url_direct_or_indirect, verbose=2) try: bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect) except InvenioBibDocFileError: # Outdated 8564 tag return [] indexer = get_idx_indexer('fulltext') if indexer != 'native': # A document might belong to multiple records for rec_link in bibdoc.bibrec_links: recid = rec_link["recid"] # Adds fulltexts of all files once per records if not recid in fulltext_added: bibrecdocs = BibRecDocs(recid) try: text = bibrecdocs.get_text() except InvenioBibDocFileError: # Invalid PDF continue if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext(recid, text) elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: xapian_add(recid, 'fulltext', text) fulltext_added.add(recid) # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: return [] else: text = "" if hasattr(bibdoc, "get_text"): text = bibdoc.get_text() return self.tokenize_for_words_default(text) else: if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY: write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2) return [] write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2) urls_to_index = set() for splash_re, url_re in iteritems(CFG_BIBINDEX_SPLASH_PAGES): if re.match(splash_re, url_direct_or_indirect): write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2) html = urllib2.urlopen(url_direct_or_indirect).read() urls = get_links_in_html_page(html) write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3) for url in urls: if re.match(url_re, url): write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2) urls_to_index.add(url) if not urls_to_index: urls_to_index.add(url_direct_or_indirect) write_message("... 
will extract words from %s" % ', '.join(urls_to_index), verbose=2) words = {} for url in urls_to_index: tmpdoc = download_url(url) file_converter_logger = get_file_converter_logger() old_logging_level = file_converter_logger.getEffectiveLevel() if self.verbose > 3: file_converter_logger.setLevel(logging.DEBUG) try: try: tmptext = convert_file(tmpdoc, output_format='.txt') text = open(tmptext).read() os.remove(tmptext) indexer = get_idx_indexer('fulltext') if indexer != 'native': if indexer == 'SOLR' and CFG_SOLR_URL: solr_add_fulltext(None, text) # FIXME: use real record ID if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED: #xapian_add(None, 'fulltext', text) # FIXME: use real record ID pass # we are relying on an external information retrieval system # to provide full-text indexing, so dispatch text to it and # return nothing here: tmpwords = [] else: tmpwords = self.tokenize_for_words_default(text) words.update(dict(map(lambda x: (x, 1), tmpwords))) except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) finally: os.remove(tmpdoc) if self.verbose > 3: file_converter_logger.setLevel(old_logging_level) return words.keys() except Exception as e: message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e) register_exception(prefix=message, alert_admin=True) write_message(message, stream=sys.stderr) return [] def tokenize_for_words(self, phrase): return self.get_words_from_fulltext(phrase) def tokenize_for_pairs(self, phrase): return [] def tokenize_for_phrases(self, phrase): return [] diff --git a/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py index 9420450fe..61db49611 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexItemCountTokenizer.py @@ -1,56 +1,56 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexItemCountTokenizer: counts the number of copies of a book which is owned by the library in the real world. """ from invenio.modules.indexer.tokenizers.BibIndexRecJsonTokenizer import BibIndexRecJsonTokenizer class BibIndexItemCountTokenizer(BibIndexRecJsonTokenizer): """ Returns a number of copies of a book which is owned by the library. 
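Editor's note on the external-URL branch of get_words_from_fulltext() further above: CFG_BIBINDEX_SPLASH_PAGES maps a splash-page URL pattern to a pattern for the fulltext links that should be followed from that page. A standalone sketch of that URL selection; the patterns and link list are made-up examples, not the real configuration.

# Illustrative sketch of the splash-page handling in
# BibIndexFulltextTokenizer.get_words_from_fulltext(): links found on a splash
# page are indexed only if they match the associated fulltext-URL pattern.
import re

SPLASH_PAGES = {r'^http://example\.org/record/\d+$':      # splash-page pattern
                r'^http://example\.org/files/.*\.pdf$'}   # fulltext-URL pattern

def urls_to_index(url, links_on_page):
    selected = set()
    for splash_re, url_re in SPLASH_PAGES.items():
        if re.match(splash_re, url):
            for link in links_on_page:
                if re.match(url_re, link):
                    selected.add(link)
    if not selected:
        selected.add(url)   # not a splash page: index the URL itself
    return sorted(selected)

if __name__ == '__main__':
    print(urls_to_index('http://example.org/record/42',
                        ['http://example.org/files/42.pdf',
                         'http://example.org/css/style.css']))
    # ['http://example.org/files/42.pdf']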
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """Tokenizes for number of copies of a book in the 'real' library""" count = 0 try: count = record['number_of_copies'] except KeyError: pass except TypeError: return [] return [str(count)] def tokenize_for_words(self, record): return self.tokenize(record) def tokenize_for_pairs(self, record): return self.tokenize(record) def tokenize_for_phrases(self, record): return self.tokenize(record) def get_tokenizing_function(self, wordtable_type): return self.tokenize diff --git a/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py index b6e1b1059..a61d4546c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexJournalTokenizer.py @@ -1,141 +1,141 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012, 2014 CERN. +# Copyright (C) 2010, 2011, 2012, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexJournalTokenizer: useful for journal index. Agregates info about journal in a specific way given by its variable journal_pubinfo_standard_form. Behaves in the same way for all index table types: - Words - Pairs - Phrases """ from invenio.legacy.dbquery import run_sql from invenio.modules.indexer.tokenizers.BibIndexMultiFieldTokenizer import BibIndexMultiFieldTokenizer from invenio.config import \ CFG_CERN_SITE, \ CFG_INSPIRE_SITE from invenio.legacy.bibindex.engine_utils import get_values_recursively from invenio.modules.records.api import get_record if CFG_CERN_SITE: CFG_JOURNAL_TAG = '773__%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "773__p 773__v (773__y) 773__c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*\s\w.*\s\(\d+\)\s\w.*$' elif CFG_INSPIRE_SITE: CFG_JOURNAL_TAG = '773__%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "773__p,773__v,773__c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*,\w.*,\w.*$' else: CFG_JOURNAL_TAG = '909C4%' CFG_JOURNAL_PUBINFO_STANDARD_FORM = "909C4p 909C4v (909C4y) 909C4c" CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK = r'^\w.*\s\w.*\s\(\d+\)\s\w.*$' class BibIndexJournalTokenizer(BibIndexMultiFieldTokenizer): """ Tokenizer for journal index. Returns joined title/volume/year/page as a word from journal tag. Tokenizer works on multiple tags. For more information on tokenizers working on per-record basis take a look on BibIndexJournalTokenizer base class. 
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): self.tag = CFG_JOURNAL_TAG self.nonmarc_tag = 'journal_info' self.journal_pubinfo_standard_form = CFG_JOURNAL_PUBINFO_STANDARD_FORM self.journal_pubinfo_standard_form_regexp_check = CFG_JOURNAL_PUBINFO_STANDARD_FORM_REGEXP_CHECK def tokenize(self, recID): """ Special procedure to extract words from journal tags. Joins title/volume/year/page into a standard form that is also used for citations. """ # get all journal tags/subfields: bibXXx = "bib" + self.tag[0] + self.tag[1] + "x" bibrec_bibXXx = "bibrec_" + bibXXx query = """SELECT bb.field_number,b.tag,b.value FROM %s AS b, %s AS bb WHERE bb.id_bibrec=%%s AND bb.id_bibxxx=b.id AND tag LIKE %%s""" % (bibXXx, bibrec_bibXXx) res = run_sql(query, (recID, self.tag)) # construct journal pubinfo: dpubinfos = {} for row in res: nb_instance, subfield, value = row if subfield.endswith("c"): # delete pageend if value is pagestart-pageend # FIXME: pages may not be in 'c' subfield value = value.split('-', 1)[0] if nb_instance in dpubinfos: dpubinfos[nb_instance][subfield] = value else: dpubinfos[nb_instance] = {subfield: value} # construct standard format: lwords = [] for dpubinfo in dpubinfos.values(): # index all journal subfields separately for tag, val in dpubinfo.items(): lwords.append(val) # index journal standard format: pubinfo = self.journal_pubinfo_standard_form for tag, val in dpubinfo.items(): pubinfo = pubinfo.replace(tag, val) if self.tag[:-1] in pubinfo: # some subfield was missing, do nothing pass else: lwords.append(pubinfo) # return list of words and pubinfos: return lwords def tokenize_via_recjson(self, recID): """ Tokenizes for journal info. Uses bibfield. """ phrases = [] rec = get_record(recID) recjson_field = rec.get(self.nonmarc_tag) get_values_recursively(recjson_field, phrases) final = [] append = final.append for phrase in phrases: info = phrase.split("-", 1) append(info[0]) return final def tokenize_for_words(self, recID): return self.tokenize(recID) def tokenize_for_pairs(self, recID): return self.tokenize(recID) def tokenize_for_phrases(self, recID): return self.tokenize(recID) def get_tokenizing_function(self, wordtable_type): return self.tokenize def get_nonmarc_tokenizing_function(self, table_type): return self.tokenize_via_recjson diff --git a/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py index 2a59e9313..5a73428f6 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexMultiFieldTokenizer.py @@ -1,81 +1,81 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
# # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexMultiFieldTokenizer. Base class for tokenizers that work on more than one field and possibly on more than one phrase at a time. """ from invenio.legacy.bibindex.engine_config import CFG_BIBINDEX_INDEX_TABLE_TYPE from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexMultiFieldTokenizer(BibIndexTokenizer): """ BibIndexMultiFieldTokenizer is an abstract tokenizer. It should be used only for inheritance. This tokenizer should be a base class for more complicated tokenizers which tokenizing functions perform calculations on per record basis and NOT per string basis (look for BibIndexDefaultTokenizer if you want to know more about the latter type of tokenization). Tokenizing functions take as an argument recID of the record we want to perform calculations on. Example: class BibIndexComplicatedTokenizer(BibIndexMultiFieldTokenizer): (...) recID = 10 a = BibIndexComplicatedTokenizer() res = a.tokenize_for_words(recID) Good examples of MultiFieldTokenizer are JournalTokenizer and AuthorCountTokenizer. Both return results after processing more than one field/tag of the record (for more information check these tokenizers). """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" raise NotImplementedError def tokenize_for_words(self, recid): raise NotImplementedError def tokenize_for_pairs(self, recid): raise NotImplementedError def tokenize_for_phrases(self, recid): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py index 6e840199c..160137e8e 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexRecJsonTokenizer.py @@ -1,70 +1,70 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2014 CERN. +# Copyright (C) 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexRecJsonTokenizer. It's an abstract class created only for inheritance purposes. Tokenizers which are based on RecJsonTokenizer use bibfield/JsonAlchemy records. Logic of the tokenization process is in the functions of bibfield module. Tokenizer itself should perform only necessary post-processing. 
""" from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexRecJsonTokenizer(BibIndexTokenizer): """ BibIndexRecJsonTokenizer is an abstract tokenizer. It should be used only for inheritance. It should be a base class for all tokenizers which need to use bibfield/JsonAlchemy records. Tokenizing function of RecJsonTokenizer takes a bibfield record as an argument. Main logic of tokenization process stays in bibfield record's functions. Tokenizing functions of all tokenizers inheriting after RecJsonTokenizer should only do post-processing tasks. For example of use please check: BibIndexFiletypeTokenizer """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): pass def tokenize(self, record): """'record' is a recjson record from bibfield module @param urls: recjson record """ raise NotImplementedError def get_tokenizing_function(self, wordtable_type): raise NotImplementedError def tokenize_for_words(self, recid): raise NotImplementedError def tokenize_for_pairs(self, recid): raise NotImplementedError def tokenize_for_phrases(self, recid): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py index 69f54c34b..05c1dad3c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexStringTokenizer.py @@ -1,65 +1,65 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Abstract BibIndexStringTokenizer. It is a tokenizer created only for inheritance. All string based tokenizers should inherit after this tokenizer. """ from invenio.modules.indexer.tokenizers.BibIndexTokenizer import BibIndexTokenizer class BibIndexStringTokenizer(BibIndexTokenizer): """ BibIndexStringTokenizer is an abstract tokenizer. It should be used only for inheritance. This tokenizer should be a base class for tokenizers which operates on strings/phrases and splits them into multiple terms/tokens. Tokenizing functions take phrase as an argument. Good examples of StringTokenizer is DeafultTokenizer. 
""" def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): """@param stemming_language: dummy @param remove_stopwords: dummy @param remove_html_markup: dummy @param remove_latex_markup: dummy """ pass def get_tokenizing_function(self, wordtable_type): """Picks correct tokenize_for_xxx function depending on type of tokenization (wordtable_type)""" raise NotImplementedError def tokenize_for_words(self, phrase): raise NotImplementedError def tokenize_for_pairs(self, phrase): raise NotImplementedError def tokenize_for_phrases(self, phrase): raise NotImplementedError diff --git a/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py index 3cb71e63f..bc06ea4ae 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexTokenizer.py @@ -1,176 +1,176 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ BibIndexTokenizer: generic, not implemented tokenizer for inheritance Inheritance tree for tokenizers in Invenio: BibIndexTokenizer ^ | |----BibIndexStringTokenizer<---| | | | BibIndexDefaultTokenizer<---| | | | BibIndexAuthorTokenizer | BibIndexExactAuthorTokenizer | (...) | |----BibIndexRecJsonTokenizer<---| | | | BibIndexFiletypeTokenizer | (...) | |----BibIndexMultiFieldTokenizer<---| | BibIndexJournalTokenizer BibIndexAuthorCountTokenizer (...) """ class BibIndexTokenizer(object): """ Base class for the tokenizers. Tokenizers are components that find terms which need to be indexed and stored in DB. Different types of tokenizers work in different ways. Tokenizers are divided into three groups: - tokenizers that take string as an input and split it into tokens/terms which later are indexed - tokenizers that take recID of the record and find terms by processing many fields/tags from the record - tokenizers that use bibfield module and their functions which precomputes terms to index """ #words part def scan_string_for_words(self, s): """Return an intermediate representation of the tokens in s. Every tokenizer should have a scan_string function, which scans the input string and lexically tags its components. These units are grouped together sequentially. 
The output of scan_string is usually something like: { 'TOKEN_TAG_LIST' : a list of valid keys in this output set, 'key1' : [val1, val2, val3] - where key describes the in some meaningful way } @param s: the input to be lexically tagged @type s: string @return: dict of lexically tagged input items In a sample Tokenizer where scan_string simply splits s on space, scan_string might output the following for "Assam and Darjeeling": { 'TOKEN_TAG_LIST' : 'word_list', 'word_list' : ['Assam', 'and', 'Darjeeling'] } @rtype: dict """ raise NotImplementedError def parse_scanned_for_words(self, o): """Calculate the token list from the intermediate representation o. While this should be an interesting computation over the intermediate representation generated by scan_string, obviously in the split-on- space example we need only return o['word_list']. @param t: a dictionary with a 'word_list' key @type t: dict @return: the token items from 'word_list' @rtype: list of string """ raise NotImplementedError def tokenize_for_words(self, s): """Main entry point. Return token list from input string s. Simply composes the functionality above. @param s: the input to be lexically tagged @type s: string @return: the token items derived from s @rtype: list of string """ raise NotImplementedError #pairs part def scan_string_for_pairs(self, s): """ See: scan_string_for_words """ raise NotImplementedError def parse_scanned_for_pairs(self, o): """ See: parse_scanned_for_words """ raise NotImplementedError def tokenize_for_pairs(self, s): """ See: tokenize_for_words """ raise NotImplementedError #phrases part def scan_string_for_phrases(self, s): """ See: scan_string_for_words """ raise NotImplementedError def parse_scanned_for_phrases(self, o): """ See: parse_scanned_for_words """ raise NotImplementedError def tokenize_for_phrases(self, s): """ See: tokenize_for_words """ raise NotImplementedError def get_tokenizing_function(self, wordtable_type): """Chooses tokenize_for_words, tokenize_for_phrases or tokenize_for_pairs depending on type of tokenization we want to perform.""" raise NotImplementedError def get_nonmarc_tokenizing_function(self, table_type): """Chooses best tokenizing function depending on type of tokenization we want to perform. Non-marc version. """ raise NotImplementedError @property def implemented(self): try: self.get_tokenizing_function("") except NotImplementedError: return False except AttributeError: return False return True @property def implemented_nonmarc(self): try: self.get_nonmarc_tokenizing_function("") except NotImplementedError: return False except AttributeError: return False - return True \ No newline at end of file + return True diff --git a/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py b/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py index 38cf6fb75..8d3b5b20c 100644 --- a/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py +++ b/invenio/modules/indexer/tokenizers/BibIndexYearTokenizer.py @@ -1,71 +1,71 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. 
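Editor's note: the base-class docstrings above describe the scan/parse/tokenize contract with a split-on-space example. The following minimal standalone tokenizer honours that contract; it is purely illustrative and does not inherit from the Invenio classes.

# Illustrative, standalone implementation of the scan/parse/tokenize contract
# documented in BibIndexTokenizer, using the split-on-space example from the
# docstring above.

class SplitOnSpaceTokenizer(object):

    def scan_string_for_words(self, s):
        # lexically tag the input: here the only unit is a word list
        return {'TOKEN_TAG_LIST': ['word_list'],
                'word_list': s.split()}

    def parse_scanned_for_words(self, scanned):
        # derive the final token list from the intermediate representation
        return scanned['word_list']

    def tokenize_for_words(self, s):
        # main entry point: compose the two steps above
        return self.parse_scanned_for_words(self.scan_string_for_words(s))

if __name__ == '__main__':
    print(SplitOnSpaceTokenizer().tokenize_for_words('Assam and Darjeeling'))
    # ['Assam', 'and', 'Darjeeling']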
# # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """BibIndexYearTokenizer: useful for year index. Extracts words (year) from date tags. """ from invenio.config import \ CFG_INSPIRE_SITE from invenio.modules.indexer.tokenizers.BibIndexDefaultTokenizer import BibIndexDefaultTokenizer class BibIndexYearTokenizer(BibIndexDefaultTokenizer): """ Year tokenizer. It tokenizes words from date tags or uses default word tokenizer. """ def __init__(self, stemming_language = None, remove_stopwords = False, remove_html_markup = False, remove_latex_markup = False): BibIndexDefaultTokenizer.__init__(self, stemming_language, remove_stopwords, remove_html_markup, remove_latex_markup) def get_words_from_date_tag(self, datestring): """ Special procedure to index words from tags storing date-like information in format YYYY or YYYY-MM or YYYY-MM-DD. Namely, we are indexing word-terms YYYY, YYYY-MM, YYYY-MM-DD, but never standalone MM or DD. """ out = [] for dateword in datestring.split(): # maybe there are whitespaces, so break these too out.append(dateword) parts = dateword.split('-') for nb in range(1, len(parts)): out.append("-".join(parts[:nb])) return out def tokenize_for_words_default(self, phrase): """Default tokenize_for_words inherited from default tokenizer""" return super(BibIndexYearTokenizer, self).tokenize_for_words(phrase) def tokenize_for_words(self, phrase): """ If CFG_INSPIRE_SITE is 1 we perform special tokenization which relies on getting words form date tag. In other case we perform default tokenization. """ if CFG_INSPIRE_SITE: return self.get_words_from_date_tag(phrase) else: return self.tokenize_for_words_default(phrase) diff --git a/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py b/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py index cd07cd9f4..36383b1ae 100644 --- a/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py +++ b/invenio/modules/records/recordext/functions/get_pubinfo_standard_form.py @@ -1,27 +1,27 @@ -# -*- coding:utf-8 -*- +# -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2010, 2011, 2012 CERN. +# Copyright (C) 2010, 2011, 2012, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
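Editor's note: get_words_from_date_tag() above indexes a date together with every leading prefix (YYYY, YYYY-MM), but never a bare month or day. A compact standalone version with a usage example:

# Illustrative, standalone version of the date expansion performed by
# BibIndexYearTokenizer.get_words_from_date_tag().

def date_words(datestring):
    out = []
    for dateword in datestring.split():        # tolerate extra whitespace
        out.append(dateword)                   # full date first
        parts = dateword.split('-')
        for nb in range(1, len(parts)):
            out.append('-'.join(parts[:nb]))   # then each leading prefix
    return out

if __name__ == '__main__':
    print(date_words('2015-03-17'))
    # ['2015-03-17', '2015', '2015-03']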
def get_pubinfo_standard_form(pubinfo): """ """ if all(key in pubinfo for key in ('c', 'p', 'v', 'y', )): return '%s %s (%s) %s' % (pubinfo['p'], pubinfo['v'], pubinfo['y'], pubinfo['c'].split("-", 1)[0], ) else: - return None \ No newline at end of file + return None diff --git a/invenio/testsuite/test_legacy_dbquery.py b/invenio/testsuite/test_legacy_dbquery.py index 979c4535a..1625bd75d 100644 --- a/invenio/testsuite/test_legacy_dbquery.py +++ b/invenio/testsuite/test_legacy_dbquery.py @@ -1,111 +1,110 @@ - # -*- coding: utf-8 -*- # # This file is part of Invenio. -# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2013 CERN. +# Copyright (C) 2006, 2007, 2008, 2010, 2011, 2013, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Unit tests for dbquery library.""" __revision__ = "$Id$" from invenio.base.wrappers import lazy_import from invenio.testsuite import make_test_suite, run_test_suite, InvenioTestCase dbquery = lazy_import('invenio.legacy.dbquery') class TableUpdateTimesTest(InvenioTestCase): """Test functions related to the update_times of MySQL tables.""" def _check_table_update_time(self, tablename): """Helper function to check update time of TABLENAME.""" from invenio.base.globals import cfg # detect MySQL version number: res = dbquery.run_sql("SELECT VERSION()") mysql_server_version = res[0][0] if mysql_server_version.startswith("5."): # MySQL-5 provides INFORMATION_SCHEMA: query = """SELECT UPDATE_TIME FROM INFORMATION_SCHEMA.TABLES WHERE table_name='%s' AND table_schema='%s'""" \ % (tablename, cfg['CFG_DATABASE_NAME']) tablename_update_time = str(dbquery.run_sql(query)[0][0]) elif mysql_server_version.startswith("4.1"): # MySQL-4.1 has it on 12th position: query = """SHOW TABLE STATUS LIKE '%s'""" % tablename tablename_update_time = str(dbquery.run_sql(query)[0][12]) elif mysql_server_version.startswith("4.0"): # MySQL-4.0 has it on 11th position: query = """SHOW TABLE STATUS LIKE '%s'""" % tablename tablename_update_time = str(dbquery.run_sql(query)[0][11]) else: tablename_update_time = "MYSQL SERVER VERSION NOT DETECTED" # compare it with the one detected by the function: self.assertEqual(tablename_update_time, dbquery.get_table_update_time(tablename)) def test_single_table_update_time(self): """dbquery - single table (with indexes) update time detection""" # NOTE: this tests usual "long" branch of # get_table_update_time() self._check_table_update_time("collection") def test_empty_table_update_time(self): """dbquery - empty table (no indexes) update time detection""" # NOTE: this tests unusual "None" branch of # get_table_update_time() # create empty test table test_table = "tmpTESTTABLE123" dbquery.run_sql("CREATE TABLE IF NOT EXISTS %s (a INT)" % test_table) # run the test: self._check_table_update_time(test_table) # drop empty test table dbquery.run_sql("DROP TABLE %s" % test_table) def test_utf8_python_mysqldb_mysql_storage_chain(self): 
"""dbquery - UTF-8 in Python<->MySQLdb<->MySQL storage chain""" # NOTE: This test test creates, uses and destroys a temporary # table called "test__invenio__utf8". beta_in_utf8 = "β" # Greek beta in UTF-8 is 0xCEB2 dbquery.run_sql("CREATE TEMPORARY TABLE test__invenio__utf8 (x char(1), y varbinary(2)) DEFAULT CHARACTER SET utf8") dbquery.run_sql("INSERT INTO test__invenio__utf8 (x, y) VALUES (%s, %s)", (beta_in_utf8, beta_in_utf8)) res = dbquery.run_sql("SELECT x,y,HEX(x),HEX(y),LENGTH(x),LENGTH(y),CHAR_LENGTH(x),CHAR_LENGTH(y) FROM test__invenio__utf8") self.assertEqual(res[0], ('\xce\xb2', '\xce\xb2', 'CEB2', 'CEB2', 2L, 2L, 1L, 2L)) dbquery.run_sql("DROP TEMPORARY TABLE test__invenio__utf8") class WashTableColumnNameTest(InvenioTestCase): """Test if wash_table_column_name and real_escape_string evaluates correctly.""" def test_wash_table_column_name(self): """dbquery - wash table column name""" testcase_error = "foo ; bar" testcase_ok = "foo_bar" self.assertRaises(Exception, dbquery.wash_table_column_name, testcase_error) self.assertEqual(testcase_ok, dbquery.wash_table_column_name(testcase_ok)) def test_real_escape_string(self): """dbquery - real escape string""" testcase_ok = "Programmer" testcase_injection = "' OR ''='" self.assertEqual(dbquery.real_escape_string(testcase_ok), testcase_ok) self.assertNotEqual(dbquery.real_escape_string(testcase_injection), testcase_injection) TEST_SUITE = make_test_suite(TableUpdateTimesTest, WashTableColumnNameTest) if __name__ == "__main__": run_test_suite(TEST_SUITE)