diff --git a/MANIFEST.in b/MANIFEST.in index 9903da470..fbe047249 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,33 +1,33 @@ -include requirements*.txt ABOUT-NLS AUTHORS ChangeLog COPYING INSTALL NEWS README RELEASE-NOTES THANKS TODO UNINSTALL bower.json Gruntfile.js mapping.cfg package.json Procfile setup.cfg +include requirements*.txt ABOUT-NLS AUTHORS ChangeLog COPYING INSTALL NEWS README RELEASE-NOTES THANKS TODO UNINSTALL bower.json Gruntfile.js mapping.cfg package.json Procfile* setup.cfg include invenio/invenio.wsgi recursive-include docs * recursive-exclude docs/_build * recursive-include invenio/testsuite * recursive-include invenio/base/static * recursive-include invenio/base/templates * recursive-include invenio/base/translations *.mo *.po *.pot recursive-include invenio/modules/*/static * recursive-include invenio/modules/*/templates * recursive-include invenio/modules/*/testsuite * recursive-include invenio/modules/*/format_elements * recursive-include invenio/modules/*/format_templates * recursive-include invenio/modules/*/output_formats * recursive-include invenio/modules/*/converterext * recursive-include invenio/modules/*/documentext * recursive-include invenio/modules/*/editorext * recursive-include invenio/modules/*/exportext * recursive-include invenio/modules/*/jsonext * recursive-include invenio/modules/*/rankext * recursive-include invenio/modules/*/recordext/* * recursive-include invenio/modules/authorids/name_authority_files * recursive-include invenio/modules/encoder/scripts * recursive-include invenio/utils/data * recursive-include invenio/legacy * -recursive-exclude * *.py[co] \ No newline at end of file +recursive-exclude * *.py[co] diff --git a/Procfile-elasticsearch b/Procfile-elasticsearch new file mode 100644 index 000000000..039fea087 --- /dev/null +++ b/Procfile-elasticsearch @@ -0,0 +1,5 @@ +web: inveniomanage runserver +cache: redis-server +worker: celeryd -E -A invenio.celery.celery --loglevel=INFO --workdir=$VIRTUAL_ENV +workermon: flower --broker=redis://localhost:6379/1 +indexer: elasticsearch -D es.config=invenio/ext/elasticsearch/config/elasticsearch.yml diff --git a/docs/api.rst b/docs/api.rst index 9e94d4535..40d031b1d 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -1,41 +1,42 @@ .. _api: API === .. toctree:: :maxdepth: 1 base Extensions ---------- .. toctree:: :maxdepth: 1 ext/assets + ext/elasticsearch ext/sqlalchemy Modules ------- .. toctree:: :maxdepth: 1 modules/annotations modules/apikeys modules/documents modules/formatter modules/jsonalchemy modules/tags modules/upgrader modules/workflows Legacy ------ .. toctree:: :maxdepth: 1 legacy/bibrecord diff --git a/docs/ext/elasticsearch.rst b/docs/ext/elasticsearch.rst new file mode 100644 index 000000000..74b646411 --- /dev/null +++ b/docs/ext/elasticsearch.rst @@ -0,0 +1,5 @@ +Elasticsearch +============= + +.. automodule:: invenio.ext.elasticsearch + :members: diff --git a/invenio/ext/elasticsearch/__init__.py b/invenio/ext/elasticsearch/__init__.py new file mode 100644 index 000000000..c6a17c678 --- /dev/null +++ b/invenio/ext/elasticsearch/__init__.py @@ -0,0 +1,862 @@ +## +## This file is part of Invenio. +## Copyright (C) 2013, 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. 
+## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +""" Elasticsearch extension for Invenio. + +invenio.ext.elasticsearch +------------------------- +Elasticsearch a search engine for Invenio. + +It should be able to perform: + - metadata and fulltext search almost without DB + - metadata facets such as authors, Invenio collection facets both with the + corresponding filters + - fulltext and metadata fields highlightings + - fast collection indexing, mid-fast metadata indexing, almost fast fulltext + indexing + + +Requirements +^^^^^^^^^^^^ + +Elasticsearch >= 1.0, pyelasticsearch. + + +Installation (for developpement) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Download the tarball form http://elasticsearch.org, uncompress it and add the +bin/ directory in your path. +Go to the invenio src directory and run ``honcho start`` or equivalent. + + +Deployement +^^^^^^^^^^^ + +... + +Testing +^^^^^^^ + +This extension is working with the demosite. Indexing is done automagicaly +using webcoll/bibupload signals. Note: bibindex is not required. + +Usage +^^^^^ + >>> es = current_app.extensions.get("elasticsearch") + >>> res = es.search(query="title:Sneutrinos", + facet_filters=[("facet_authors", "Schael, S"), + ("facet_authors", "Bruneliere, R")]) + +see: `a simple of search interface <http://github.com/jma/elasticsearch_view>`_ +or invenio/ext/elasticsearch/scripts/test_es.py for a complete manual +indexing and searching example. + +TODO: + - adding exceptions + - decide if we create one ES document type for each JsonAlchemy document type + - convert an Invenio query into a ES query + - add sort options in JsonAchemy + - check collection access restriction with collection filters + - probably needs a collection exclusion list as search params + - Note: file access restriction is not managed by the indexer + - multi-lingual support (combo plugin, in config files) + - term boosting configuration (in JsonAlchemy?) + - search by marc field support? + - test hierachical collections + - test similar documents + - and many more... +""" +from werkzeug.utils import cached_property +from pyelasticsearch import ElasticSearch as PyElasticSearch + + +class ElasticSearch(object): + + """ + Flask extension. + + Initialization of the extension: + + >>> from flask import Flask + >>> from flask_elasticsearch import ElasticSearch + >>> app = Flask('myapp') + >>> s = ElasticSearch(app=app) + + or alternatively using the factory pattern: + + >>> app = Flask('myapp') + >>> s = ElasticSearch() + >>> s.init_app(app) + """ + + def __init__(self, app=None): + """Build the extension object.""" + self.app = app + + #default process functions + self.process_query = lambda x: x + self.process_results = lambda x: x + + # TODO: to put in config? + self.records_doc_type = "records" + self.documents_doc_type = "documents" + self.collections_doc_type = "collections" + + # to cache recids collections + self._recids_collections = {} + + if app is not None: + self.init_app(app) + + def init_app(self, app): + """ + Initialize a Flask application. + + Only one Registry per application is allowed. 
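For completeness, a minimal sketch of the factory pattern combined with overridden settings; the configuration keys are the ones read by ``init_app()`` below, while the URL and index name values are purely illustrative::

    from flask import Flask
    from invenio.ext.elasticsearch import ElasticSearch

    app = Flask('myapp')
    # Override the defaults before init_app() reads them.
    app.config['ELASTICSEARCH_URL'] = 'http://localhost:9200/'
    app.config['ELASTICSEARCH_INDEX'] = 'invenio'
    app.config['ELASTICSEARCH_NUMBER_OF_SHARDS'] = 1      # keeps facet counts exact
    app.config['ELASTICSEARCH_NUMBER_OF_REPLICAS'] = 0    # fine for a single node

    es = ElasticSearch()
    es.init_app(app)
    assert app.extensions['elasticsearch'] is es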
+ """ + app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/') + app.config.setdefault('ELASTICSEARCH_INDEX', "invenio") + app.config.setdefault('ELASTICSEARCH_NUMBER_OF_SHARDS', 1) + app.config.setdefault('ELASTICSEARCH_NUMBER_OF_REPLICAS', 0) + app.config.setdefault('ELASTICSEARCH_DATE_DETECTION', False) + app.config.setdefault('ELASTICSEARCH_NUMERIC_DETECTION', False) + app.config.setdefault('ELASTICSEARCH_ANALYSIS', { + "default": {"type": "simple"}}) + + # Follow the Flask guidelines on usage of app.extensions + if not hasattr(app, 'extensions'): + app.extensions = {} + if 'elasticsearch' in app.extensions: + raise Exception("Flask application already initialized") + + app.extensions['elasticsearch'] = self + self.app = app + + @cached_property + def connection(self): + """Return a pyelasticsearch connection object.""" + return PyElasticSearch(self.app.config['ELASTICSEARCH_URL']) + + def set_query_handler(self, handler): + """ + Specify a function to convert the invenio query into a ES query. + + :param handler: [function] take a query[string] parameter + """ + self.process_query = handler + + def set_results_handler(self, handler): + """ + Set a function to process the search results. + + To convert ES search results into an object understandable by Invenio. + + :param handler: [function] take a query[string] parameter + """ + self.process_results = handler + + @property + def status(self): + """The status of the ES cluster. + + See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/cluster-health.html + for more. + + TODO: is it usefull? + + :return: [string] possible values: green, yellow, red. green means all + ok including replication, yellow means replication not active, red + means partial results. + """ + return self.connection.health().get("status") + + def index_exists(self, index=None): + """Check if the index exists in the cluster. + + :param index: [string] index name + + :return: [bool] True if exists + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + if self.connection.status().get("indices").get(index): + return True + return False + + def delete_index(self, index=None): + """Delete the given index. + + :param index: [string] index name + + :return: [bool] True if success + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + try: + self.connection.delete_index(index=index) + return True + except: + return False + + def create_index(self, index=None): + """Create the given index. + + Also set basic configuration and doc type mappings. 
+ + :param index: [string] index name + + :return: [bool] True if success + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + if self.index_exists(index=index): + return True + try: + #create index + index_settings = { + #should be set to 1 for exact facet count + "number_of_shards": + self.app.config['ELASTICSEARCH_NUMBER_OF_SHARDS'], + + #in case of primary shard failed + "number_of_replicas": + self.app.config['ELASTICSEARCH_NUMBER_OF_REPLICAS'], + + #disable automatic type detection + #that can cause errors depending of the indexing order + "date_detection": + self.app.config['ELASTICSEARCH_DATE_DETECTION'], + "numeric_detection": + self.app.config['ELASTICSEARCH_NUMERIC_DETECTION'] + } + if self.app.config['ELASTICSEARCH_ANALYSIS']: + index_settings["analysis"] = \ + self.app.config['ELASTICSEARCH_ANALYSIS'] + + self.connection.create_index(index=index, settings=index_settings) + + from es_config import get_records_fields_config, \ + get_documents_fields_config, \ + get_collections_fields_config + #mappings + self._mapping(index=index, doc_type=self.records_doc_type, + fields_mapping=get_records_fields_config()) + + self._mapping(index=index, doc_type=self.documents_doc_type, + fields_mapping=get_documents_fields_config(), + parent_type=self.records_doc_type) + + self._mapping(index=index, doc_type=self.collections_doc_type, + fields_mapping=get_collections_fields_config(), + parent_type=self.records_doc_type) + + return True + except: + return False + + def _mapping(self, index, doc_type, fields_mapping, parent_type=None): + mapping = { + doc_type: { + "properties": fields_mapping + } + } + + # specific conf for join like query + if parent_type: + mapping[doc_type]["_parent"] = {"type": parent_type} + try: + self.connection.put_mapping(index=index, doc_type=doc_type, + mapping=mapping) + return True + except: + return False + + def _bulk_index_docs(self, docs, doc_type, index): + if not docs: + return [] + self.app.logger.info("Indexing: %d records for %s" % (len(docs), + doc_type)) + results = self.connection.bulk_index(index=index, + doc_type=doc_type, docs=docs, + id_field='_id', + refresh=self.app.config.get("DEBUG")) + errors = [] + for it in results.get("items"): + if it.get("index").get("error"): + errors.append((it.get("index").get("_id"), it.get("index").get("error"))) + return errors + + def _documents_has_been_updated(self, recid): + from invenio.legacy.bibdocfile.api import BibRecDocs + import datetime + + bibdocs = BibRecDocs(recid) + #TODO: replace legacy code + from invenio.legacy.dbquery import run_sql + (record_creation_date, record_modification_date) = \ + run_sql("SELECT creation_date, modification_date from bibrec where id=%s" + % (recid))[0] + + #wait for a JsonAlchemy bug resolution + #record = self._get_record(recid) + + #record_modification_date = \ + # datetime.datetime.strptime(record.get("modification_date"), + # "%Y-%m-%dT%H:%M:%S") + #record_creation_date = \ + # datetime.datetime.strptime(record.get("creation_date"), + # "%Y-%m-%dT%H:%M:%S.%f") + if not bibdocs.list_bibdocs(): + self.app.logger.debug("No docs for: %s" % recid) + for b in bibdocs.list_bibdocs(): + #should add fews seconds for rounding problem + if b.md + datetime.timedelta(seconds=2) >= record_modification_date: + return True + return False + + def _get_record(self, recid): + from invenio.modules.records.api import get_record + record_as_dict = get_record(recid, reset_cache=True).dumps() + del record_as_dict["__meta_metadata__"] + return record_as_dict + + def 
_get_text(self, recid): + from invenio.legacy.bibdocfile.api import BibRecDocs + text = BibRecDocs(recid).get_text(True) + if not text: + self.app.logger.debug("No text for:%s" % recid) + return None + return { + "fulltext": text, + "recid": recid, + "_id": recid, + "_parent": recid} + + def _get_collections(self, recid): + return { + "recid": recid, + "_id": recid, + "_parent": recid, + "name": self._recids_collections.get(recid, "")} + + def get_all_collections_for_records(self, recreate_cache_if_needed=True): + """Return a dict with recid as key and collection list as value. + + This replace existing Invenio function for performance reason. + + :param recreate_cache_if_needed: [bool] True if regenerate the cache + """ + from invenio.legacy.search_engine import collection_reclist_cache, get_collection_reclist + from invenio.legacy.websearch.webcoll import Collection + ret = {} + + #update the cache? + if recreate_cache_if_needed: + collection_reclist_cache.recreate_cache_if_needed() + + for name in collection_reclist_cache.cache.keys(): + recids = get_collection_reclist(name, recreate_cache_if_needed=False) + full_path_name = "/".join([v.name for v in + Collection(name).get_ancestors()] + + [name]) + for recid in recids: + ret.setdefault(recid, []).append(full_path_name) + self._recids_collections = ret + + def index_collections(self, recids=None, index=None, bulk_size=100000, **kwargs): + """Index collections. + + Collections maps computed by webcoll is indexed into the given index in + order to allow filtering by collection. + + :param recids: [list of int] recids to index + :param index: [string] index name + :param bulk_size: [int] batch size to index + + :return: [list of int] list of recids not indexed due to errors + """ + self.get_all_collections_for_records() + if not recids: + recids = self._recids_collections.keys() + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + return self._index_docs(recids, self.collections_doc_type, index, + bulk_size, self._get_collections) + + def index_documents(self, recids, index=None, bulk_size=100000, **kwargs): + """Index fulltext files. + + Put the fullext extracted by Invenio into the given index. + + :param recids: [list of int] recids to index + :param index: [string] index name + :param bulk_size: [int] batch size to index + + :return: [list of int] list of recids not indexed due to errors + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + recids_to_index = filter(self._documents_has_been_updated, recids) + if recids_to_index: + self.app.logger.debug("Indexing document for %s" % recids) + return self._index_docs(recids_to_index, self.documents_doc_type, index, + bulk_size, self._get_text) + + def index_records(self, recids, index=None, bulk_size=100000, **kwargs): + """Index bibliographic records. + + The document structure is provided by JsonAlchemy. + + Note: the __metadata__ is removed for the moment. + + TODO: is should be renamed as index? 
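Taken together, the three ``index_*`` methods above cover a full manual (re)indexing pass. A sketch, assuming ``recids`` is a list of record ids and treating the returned ``(recid, error)`` pairs as failures to log; the ``bulk_size`` values and recid range are illustrative::

    from flask import current_app

    es = current_app.extensions.get("elasticsearch")
    recids = range(1, 100)                                    # illustrative range

    failed = es.index_records(recids, bulk_size=1000)
    failed += es.index_documents(recids, bulk_size=100) or []  # may return None
    failed += es.index_collections()           # reindex the whole webcoll map

    for recid, error in failed:
        current_app.logger.warning("recid %s not indexed: %s", recid, error)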
+ + :param recids: [list of int] recids to index + :param index: [string] index name + :param bulk_size: [int] batch size to index + + :return: [list] list of recids not indexed due to errors + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + return self._index_docs(recids, self.records_doc_type, index, + bulk_size, self._get_record) + + def _index_docs(self, recids, doc_type, index, bulk_size, get_docs): + docs = [] + errors = [] + for recid in recids: + doc = get_docs(recid) + if doc: + docs.append(doc) + if len(docs) >= bulk_size: + errors += self._bulk_index_docs(docs, doc_type=doc_type, + index=index) + docs = [] + errors += self._bulk_index_docs(docs, doc_type=doc_type, index=index) + return errors + + def find_similar(self, recid, index=None, **kwargs): + """Find simlar documents to the given recid. + + TODO: tests + + :param recid: [int] document id to find similar + :param index: [string] index name + + :return: [list] list of recids + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + fields_to_compute_similarity = ["_all"] + return self.connection.more_like_this(index=index, + doc_type=self.records_doc_type, + id=recid, + mlt_fields=fields_to_compute_similarity) + + def _search_records(self, query=None, index=None, main_options={}, sort_options={}, + facets={}, highlights={}, facet_filters=[]): + """Perform search on records. + + It compute: + - hits + - metadata facets + - metadata highlights + + :param query: [nested dict] ES query derived from Invenio query + :param index: [string] index name + :param main_options: [nested dict] ES main options such as paging + :param sort_options: [nested dict] ES sort options + :param facets: [nested dict] ES facets configuration + :param highlights: [nested dict] ES highlight configuration + :param facet_filters: [nested dict] ES facet filters, i.e. when the + user click on the facet term + + Here is the basic form of the query: + { + #here search options such as paging + "query": { + "filtered" : { + "query": { + "bool": { + #this is valid for a pure textual query, will + #becomes a more complexe query with a Invenio to ES + #converter see: + #http://pythonhosted.org/Whoosh/parsing.html for + #create_create_basic_search_unit + #inspiration + "should": [{ + #records query + }, + { + #fulltext is on query part as is a part of ranking + "has_child": { + "type": "documents", + "query": {#children query} + } + }], + # or condition + "minimum_should_match" : 1 + } + } + }, + "filter": { + "bool": { + "must": [ + #facet filters including collection filters using has_child + ] + } + } + }, + "sort": {#sort options}, + "highlights": {#hightlight options}, + "facets": {#facets options without facet_filter as it is done on the query part} + + } + """ + if not query: + query = { + "query": { + "match_all": {} + } + } + es_query = {} + es_query.update(main_options) + if facet_filters: + es_query.update({ + "query": { + "filtered": { + "query": query.get("query"), + "filter": { + "bool": { + "must": facet_filters + } + } + } + } + }) + else: + es_query.update(query) + es_query.update(sort_options) + es_query.update(facets) + es_query.update(highlights) + results = self.process_results(self.connection.search(es_query, + index=index, + doc_type=self.records_doc_type)) + return (results, es_query) + + def _search_documents(self, query, index, filtered_ids): + """Preform a search query to extract hilights from documents. 
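A short usage sketch for ``find_similar()``; note that, unlike ``search()``, it hands back the raw ``more_like_this`` response from pyelasticsearch rather than a processed ``Response`` object. The recid is illustrative::

    from flask import current_app

    es = current_app.extensions.get("elasticsearch")
    raw = es.find_similar(42)
    # The raw response follows the usual ES search layout.
    similar_recids = [int(hit["_id"])
                      for hit in raw.get("hits", {}).get("hits", [])]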
+ + :param query: [nested dict] ES query derived from Invenio query + :param index: [string] index name + :param filtered_ids: [list of int] list of record ids return by the + records search query + + :return: [object] response + + Here is the basic form of the query: + { + "size": 10, + "fields": [], + "query": { + "filtered": { + "query": {#fulltext query similar than records query with has_child -> has_parent}, + "filter": { + "ids": { + "values": #ids returned by records search + } + } + } + }, + "highlight": {#fulltext highlights config} + } + """ + from es_config import get_documents_highlights_config + documents_query = { + "size": 10, + "fields": [], + "query": { + "filtered": { + "query": { + "query_string": { + "default_field": "fulltext", + "query": query}}, + "filter": { + "ids": { + "values": filtered_ids + } + } + } + }, + "highlight": get_documents_highlights_config() + } + + #submit query for fulltext highlighting + return self.process_results(self.connection.search(documents_query, + index=index, + doc_type=self.documents_doc_type)) + + def _search_collections(self, records_query, index, include_collections): + """Perform a search query to extract hilights from documents. + + :param records_query: [nested dict] query used to search into records + :param index: [string] index name + :param exclude_collections: [list of strings] collection name to exclude to facets + + :return: [object] response + + Here is the basic form of the query: + { + #no results only facets/aggregators computation + "size": 0, + "query": { + "has_parent": { + "parent_type" : "records", + "query": #same as records query + }, + }, + "aggs": { + "collections": { + "terms" : { + "field" : "name", + #options + "min_doc_count": 1, + "order" : { "_term" : "asc" }, + } + } + } + } + """ + collections_query = { + "size": 0, + "query": { + "has_parent": { + "parent_type": "records", + "query": records_query.get("query") + } + }, + "aggs": { + "collections": { + "terms": { + "field": "name", + "min_doc_count": 1, + "order": {"_term": "asc"} + } + } + } + } + + return self.process_results(self.connection.search(collections_query, + index=index, + doc_type=self.collections_doc_type)) + + def search(self, query, index=None, cc=None, f="", rg=None, + sf=None, so="d", jrec=0, facet_filters=[], **kwargs): + """Perform a search query. + + Note: a lot of work to do. + + :param query: [string] search query + :param recids: [list of int] recids to index + :param index: [string] index name + :param cc: [string] main collection name + :param f: [string] field to search (not yet used) + :param rg: [int] number of results to return + :param sf: [string] sort field + :param so: [string] sort order in [d,a] + :param jrec: [int] result offset for paging + :param facet_filters: [list of tupple of strings] filters to prune the + results. Each filter is defined as a tupple of term, value: (i.e. 
+ [("facet_authors", "Ellis, J.")]) + + :return: [object] response + """ + if index is None: + index = self.app.config['ELASTICSEARCH_INDEX'] + + if cc is None: + cc = self.app.config['CFG_SITE_NAME'] + + if rg is None: + rg = int(self.app.config['CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS']) + #converted Invenio query + es_query = self.process_query(query) + + #search main options + main_options = { + "size": rg, + "from": jrec, + "fields": [] + } + + # sorting + sort_options = {} + if sf: + sort_options = { + "sort": [{ + "sort_%s" % sf: { + "order": "desc" if so == "d" else "asc" + } + }] + } + + # facet_filters + include_collection = [] + #es_filters = [] + es_filters = [{ + "has_child": { + "type": self.collections_doc_type, + "filter": { + "term": { + "name": { + "value": cc + } + } + } + } + }] + for ft in facet_filters: + (term, value) = ft + if term == "facet_collections": + include_collection.append(value) + es_filters.append({ + "has_child": { + "type": self.collections_doc_type, + "filter": { + "term": { + "name": { + "value": value + } + } + } + } + }) + else: + es_filters.append({ + "term": { + term: value + } + }) + if not include_collection: + include_collection = [cc] + + # facet configuration + from es_config import get_records_facets_config + facets = { + "aggs": get_records_facets_config() + } + + # hightlight configuration + from es_config import get_records_highlights_config + highlights = { + "highlight": get_records_highlights_config() + } + + (results, records_query) = self._search_records(query=es_query, index=index, + main_options=main_options, + sort_options=sort_options, + facets=facets, + highlights=highlights, + facet_filters=es_filters) + + #build query for fulltext highlighting + matched_ids = [recid for recid in results.hits] + hi_results = self._search_documents(query=query, index=index, filtered_ids=matched_ids) + + #merge with existing metadata highlights + for recid, hi in hi_results.highlights.iteritems(): + results.highlights.data[int(recid)].update(hi) + + #compute facets for collections + cols_results = self._search_collections(records_query=records_query, + index=index, + include_collections=include_collection) + results.facets.update(cols_results.facets) + + return results + + +def index_record(sender, recid): + """ + Index a given record. + + Used to connect to signal. + + :param recid: [int] recid to index + """ + from .tasks import index_records + return index_records.delay(sender, recid) + + +def index_collections(sender, collections): + """ + Index a given ghe collection. + + Used to connect to signal. + + Note: all collections are indexed as it is fast. + :param collections: [list of string] collection names + """ + from .tasks import index_collections + return index_collections.delay(sender, []) + + +def drop_index(sender, yes_i_know, default_data=None): + """ + Remove the elasticsearch index. + + Used to connect to signal. + """ + from flask import current_app + es = current_app.extensions.get("elasticsearch") + es.delete_index() + + +def create_index(sender, yes_i_know=False, default_data=None): + """ + Create the elasticsearch index. + + Index creation, settings and mapping. + + Used to connect to signal. 
+ """ + from flask import current_app + es = current_app.extensions.get("elasticsearch") + es.delete_index() + es.create_index() + + +def setup_app(app): + """Set up the extension for the given app.""" + from es_query import process_es_query, process_es_results + es = ElasticSearch(app) + es.set_query_handler(process_es_query) + es.set_results_handler(process_es_results) + + app.extensions["registry"]["packages"].register("invenio.ext.elasticsearch") + from invenio.base import signals + signals.record_after_update.connect(index_record) + signals.record_after_create.connect(index_record) + signals.webcoll_after_reclist_cache_update.connect(index_collections) + from invenio.base.scripts.database import recreate, drop, create + signals.pre_command.connect(drop_index, sender=drop) + signals.post_command.connect(create_index, sender=create) + signals.pre_command.connect(drop_index, sender=recreate) + signals.post_command.connect(create_index, sender=recreate) diff --git a/invenio/ext/elasticsearch/config/__init__.py b/invenio/ext/elasticsearch/config/__init__.py new file mode 100644 index 000000000..4344eaf1a --- /dev/null +++ b/invenio/ext/elasticsearch/config/__init__.py @@ -0,0 +1,19 @@ +## +## This file is part of Invenio. +## Copyright (C) 2013, 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""Config part of elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/config/elasticsearch.yml b/invenio/ext/elasticsearch/config/elasticsearch.yml new file mode 100644 index 000000000..9ba9b1a7e --- /dev/null +++ b/invenio/ext/elasticsearch/config/elasticsearch.yml @@ -0,0 +1,370 @@ +#################### ElasticSearch Configuration Example ##################### + +# This file contains an overview of various configuration settings, +# targeted at operations staff. Application developers should +# consult the guide at <http://elasticsearch.org/guide>. +# +# The installation procedure is covered at +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/setup.html>. +# +# ElasticSearch comes with reasonable defaults for most settings, +# so you can try it out without bothering with configuration. +# +# Most of the time, these defaults are just fine for running a production +# cluster. If you're fine-tuning your cluster, or wondering about the +# effect of certain configuration option, please _do ask_ on the +# mailing list or IRC channel [http://elasticsearch.org/community]. + +# Any element in the configuration can be replaced with environment variables +# by placing them in ${...} notation. 
For example: +# +# node.rack: ${RACK_ENV_VAR} + +# For information on supported formats and syntax for the config file, see +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/setup-configuration.html> + + +################################### Cluster ################################### + +# Cluster name identifies your cluster for auto-discovery. If you're running +# multiple clusters on the same network, make sure you're using unique names. +# +# cluster.name: elasticsearch +cluster.name: invenio + + +#################################### Node ##################################### + +# Node names are generated dynamically on startup, so you're relieved +# from configuring them manually. You can tie this node to a specific name: +# +# node.name: "Franz Kafka" +node.name: "node1" + +# Every node can be configured to allow or deny being eligible as the master, +# and to allow or deny to store the data. +# +# Allow this node to be eligible as a master node (enabled by default): +# +# node.master: true +# +# Allow this node to store data (enabled by default): +# +# node.data: true + +# You can exploit these settings to design advanced cluster topologies. +# +# 1. You want this node to never become a master node, only to hold data. +# This will be the "workhorse" of your cluster. +# +# node.master: false +# node.data: true +# +# 2. You want this node to only serve as a master: to not store any data and +# to have free resources. This will be the "coordinator" of your cluster. +# +# node.master: true +# node.data: false +# +# 3. You want this node to be neither master nor data node, but +# to act as a "search load balancer" (fetching data from nodes, +# aggregating results, etc.) +# +# node.master: false +# node.data: false + +# Use the Cluster Health API [http://localhost:9200/_cluster/health], the +# Node Info API [http://localhost:9200/_cluster/nodes] or GUI tools +# such as <http://github.com/lukas-vlcek/bigdesk> and +# <http://mobz.github.com/elasticsearch-head> to inspect the cluster state. + +# A node can have generic attributes associated with it, which can later be used +# for customized shard allocation filtering, or allocation awareness. An attribute +# is a simple key value pair, similar to node.key: value, here is an example: +# +# node.rack: rack314 + +# By default, multiple nodes are allowed to start from the same installation location +# to disable it, set the following: +# node.max_local_storage_nodes: 1 + + +#################################### Index #################################### + +# You can set a number of options (such as shard/replica options, mapping +# or analyzer definitions, translog settings, ...) for indices globally, +# in this file. +# +# Note, that it makes more sense to configure index settings specifically for +# a certain index, either when creating it or by using the index templates API. +# +# See <http://elasticsearch.org/guide/en/elasticsearch/reference/current/index-modules.html> and +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/indices-create-index.html> +# for more information. 
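For this extension the index-level settings below are not taken from this file: ``invenio.ext.elasticsearch`` applies them per index at creation time through pyelasticsearch, roughly as sketched here (the values mirror the extension's defaults and are otherwise illustrative)::

    from pyelasticsearch import ElasticSearch as PyElasticSearch

    conn = PyElasticSearch('http://localhost:9200/')
    conn.create_index(index='invenio', settings={
        'number_of_shards': 1,       # keep 1 for exact facet counts
        'number_of_replicas': 0,     # single-node development setup
        'date_detection': False,     # avoid order-dependent type guessing
        'numeric_detection': False,
    })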
+ +# Set the number of shards (splits) of an index (5 by default): +# +# index.number_of_shards: 5 + +# Set the number of replicas (additional copies) of an index (1 by default): +# +# index.number_of_replicas: 1 + +# Note, that for development on a local machine, with small indices, it usually +# makes sense to "disable" the distributed features: +# +# index.number_of_shards: 1 +# index.number_of_replicas: 0 + +# These settings directly affect the performance of index and search operations +# in your cluster. Assuming you have enough machines to hold shards and +# replicas, the rule of thumb is: +# +# 1. Having more *shards* enhances the _indexing_ performance and allows to +# _distribute_ a big index across machines. +# 2. Having more *replicas* enhances the _search_ performance and improves the +# cluster _availability_. +# +# The "number_of_shards" is a one-time setting for an index. +# +# The "number_of_replicas" can be increased or decreased anytime, +# by using the Index Update Settings API. +# +# ElasticSearch takes care about load balancing, relocating, gathering the +# results from nodes, etc. Experiment with different settings to fine-tune +# your setup. + +# Use the Index Status API (<http://localhost:9200/A/_status>) to inspect +# the index status. + + +#################################### Paths #################################### + +# Path to directory containing configuration (this file and logging.yml): +# +# path.conf: /path/to/conf + +# Path to directory where to store index data allocated for this node. +# +# path.data: /path/to/data +# +# Can optionally include more than one location, causing data to be striped across +# the locations (a la RAID 0) on a file level, favouring locations with most free +# space on creation. For example: +# +# path.data: /path/to/data1,/path/to/data2 + +# Path to temporary files: +# +# path.work: /path/to/work + +# Path to log files: +# +# path.logs: /path/to/logs + +# Path to where plugins are installed: +# +# path.plugins: /path/to/plugins + + +#################################### Plugin ################################### + +# If a plugin listed here is not installed for current node, the node will not start. +# +# plugin.mandatory: mapper-attachments,lang-groovy + + +################################### Memory #################################### + +# ElasticSearch performs poorly when JVM starts swapping: you should ensure that +# it _never_ swaps. +# +# Set this property to true to lock the memory: +# +# bootstrap.mlockall: true +bootstrap.mlockall: true + +# Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set +# to the same value, and that the machine has enough memory to allocate +# for ElasticSearch, leaving enough memory for the operating system itself. +# +# You should also make sure that the ElasticSearch process is allowed to lock +# the memory, eg. by using `ulimit -l unlimited`. + + +############################## Network And HTTP ############################### + +# ElasticSearch, by default, binds itself to the 0.0.0.0 address, and listens +# on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node +# communication. (the range means that if the port is busy, it will automatically +# try the next port). + +# Set the bind address specifically (IPv4 or IPv6): +# +# network.bind_host: 192.168.0.1 + +# Set the address other nodes will use to communicate with this node. If not +# set, it is automatically derived. It must point to an actual IP address. 
+# +# network.publish_host: 192.168.0.1 + +# Set both 'bind_host' and 'publish_host': +# +# network.host: 192.168.0.1 + +# Set a custom port for the node to node communication (9300 by default): +# +# transport.tcp.port: 9300 +transport.tcp.port: 9300 + +# Enable compression for all communication between nodes (disabled by default): +# +# transport.tcp.compress: true + +# Set a custom port to listen for HTTP traffic: +# +# http.port: 9200 +http.port: 9200 + +# Set a custom allowed content length: +# +# http.max_content_length: 100mb + +# Disable HTTP completely: +# +# http.enabled: false + + +################################### Gateway ################################### + +# The gateway allows for persisting the cluster state between full cluster +# restarts. Every change to the state (such as adding an index) will be stored +# in the gateway, and when the cluster starts up for the first time, +# it will read its state from the gateway. + +# There are several types of gateway implementations. For more information, see +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/modules-gateway.html>. + +# The default gateway type is the "local" gateway (recommended): +# +# gateway.type: local + +# Settings below control how and when to start the initial recovery process on +# a full cluster restart (to reuse as much local data as possible when using shared +# gateway). + +# Allow recovery process after N nodes in a cluster are up: +# +# gateway.recover_after_nodes: 1 + +# Set the timeout to initiate the recovery process, once the N nodes +# from previous setting are up (accepts time value): +# +# gateway.recover_after_time: 5m + +# Set how many nodes are expected in this cluster. Once these N nodes +# are up (and recover_after_nodes is met), begin recovery process immediately +# (without waiting for recover_after_time to expire): +# +# gateway.expected_nodes: 2 + + +############################# Recovery Throttling ############################# + +# These settings allow to control the process of shards allocation between +# nodes during initial recovery, replica allocation, rebalancing, +# or when adding and removing nodes. + +# Set the number of concurrent recoveries happening on a node: +# +# 1. During the initial recovery +# +# cluster.routing.allocation.node_initial_primaries_recoveries: 4 +# +# 2. During adding/removing nodes, rebalancing, etc +# +# cluster.routing.allocation.node_concurrent_recoveries: 2 + +# Set to throttle throughput when recovering (eg. 100mb, by default 20mb): +# +# indices.recovery.max_bytes_per_sec: 20mb + +# Set to limit the number of open concurrent streams when +# recovering a shard from a peer: +# +# indices.recovery.concurrent_streams: 5 + + +################################## Discovery ################################## + +# Discovery infrastructure ensures nodes can be found within a cluster +# and master node is elected. Multicast discovery is the default. + +# Set to ensure a node sees N other master eligible nodes to be considered +# operational within the cluster. Its recommended to set it to a higher value +# than 1 when running more than 2 nodes in the cluster. +# +# discovery.zen.minimum_master_nodes: 1 + +# Set the time to wait for ping responses from other nodes when discovering. 
+# Set this option to a higher value on a slow or congested network +# to minimize discovery failures: +# +# discovery.zen.ping.timeout: 3s + +# For more information, see +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/modules-discovery-zen.html> + +# Unicast discovery allows to explicitly control which nodes will be used +# to discover the cluster. It can be used when multicast is not present, +# or to restrict the cluster communication-wise. +# +# 1. Disable multicast discovery (enabled by default): +# +# discovery.zen.ping.multicast.enabled: false +discovery.zen.ping.multicast.enabled: false +# +# 2. Configure an initial list of master nodes in the cluster +# to perform discovery when new nodes (master or data) are started: +# +# discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] + +# EC2 discovery allows to use AWS EC2 API in order to perform discovery. +# +# You have to install the cloud-aws plugin for enabling the EC2 discovery. +# +# For more information, see +# <http://elasticsearch.org/guide/en/elasticsearch/reference/current/modules-discovery-ec2.html> +# +# See <http://elasticsearch.org/tutorials/elasticsearch-on-ec2/> +# for a step-by-step tutorial. + + +################################## Slow Log ################################## + +# Shard level query and fetch threshold logging. + +#index.search.slowlog.threshold.query.warn: 10s +#index.search.slowlog.threshold.query.info: 5s +#index.search.slowlog.threshold.query.debug: 2s +#index.search.slowlog.threshold.query.trace: 500ms + +#index.search.slowlog.threshold.fetch.warn: 1s +#index.search.slowlog.threshold.fetch.info: 800ms +#index.search.slowlog.threshold.fetch.debug: 500ms +#index.search.slowlog.threshold.fetch.trace: 200ms + +#index.indexing.slowlog.threshold.index.warn: 10s +#index.indexing.slowlog.threshold.index.info: 5s +#index.indexing.slowlog.threshold.index.debug: 2s +#index.indexing.slowlog.threshold.index.trace: 500ms + +################################## GC Logging ################################ + +#monitor.jvm.gc.young.warn: 1000ms +#monitor.jvm.gc.young.info: 700ms +#monitor.jvm.gc.young.debug: 400ms + +#monitor.jvm.gc.old.warn: 10s +#monitor.jvm.gc.old.info: 5s +#monitor.jvm.gc.old.debug: 2s diff --git a/invenio/ext/elasticsearch/config/invenio_stop.txt b/invenio/ext/elasticsearch/config/invenio_stop.txt new file mode 100644 index 000000000..9160c449e --- /dev/null +++ b/invenio/ext/elasticsearch/config/invenio_stop.txt @@ -0,0 +1,783 @@ +and +will +their +then +there +these +that +they +or +as +a +but +in +for +the +of +no +into +be +with +not +by +on +at +was +to +such +it +if +is +are +this + +à +â +abord +afin +ah +ai +aie +ainsi +allaient +allo +allô +allons +après +assez +attendu +au +aucun +aucune +aujourd +aujourd'hui +auquel +aura +auront +aussi +autre +autres +aux +auxquelles +auxquels +avaient +avais +avait +avant +avec +avoir +ayant +b +bah +beaucoup +bien +bigre +boum +bravo +brrr +c +ça +car +ce +ceci +cela +celle +celle-ci +celle-là +celles +celles-ci +celles-là +celui +celui-ci +celui-là +cent +cependant +certain +certaine +certaines +certains +certes +ces +cet +cette +ceux +ceux-ci +ceux-là +chacun +chaque +cher +chère +chères +chers +chez +chiche +chut +ci +cinq +cinquantaine +cinquante +cinquantième +cinquième +clac +clic +combien +comme +comment +compris +concernant +contre +couic +crac +d +da +dans +de +debout +dedans +dehors +delà +depuis +derrière +des +dès +désormais +desquelles +desquels +dessous +dessus +deux +deuxième +deuxièmement +devant +devers 
+devra +différent +différente +différentes +différents +dire +divers +diverse +diverses +dix +dix-huit +dixième +dix-neuf +dix-sept +doit +doivent +donc +dont +douze +douzième +dring +du +duquel +durant +e +effet +eh +elle +elle-même +elles +elles-mêmes +en +encore +entre +envers +environ +es +ès +est +et +etant +étaient +étais +était +étant +etc +été +etre +être +eu +euh +eux +eux-mêmes +excepté +f +façon +fais +faisaient +faisant +fait +feront +fi +flac +floc +font +g +gens +h +ha +hé +hein +hélas +hem +hep +hi +ho +holà +hop +hormis +hors +hou +houp +hue +hui +huit +huitième +hum +hurrah +i +il +ils +importe +j +je +jusqu +jusque +k +l +la +là +laquelle +las +le +lequel +les +lès +lesquelles +lesquels +leur +leurs +longtemps +lorsque +lui +lui-même +m +ma +maint +mais +malgré +me +même +mêmes +merci +mes +mien +mienne +miennes +miens +mille +mince +moi +moi-même +moins +mon +moyennant +n +na +ne +néanmoins +neuf +neuvième +ni +nombreuses +nombreux +non +nos +notre +nôtre +nôtres +nous +nous-mêmes +nul +o +o| +ô +oh +ohé +olé +ollé +on +ont +onze +onzième +ore +ou +où +ouf +ouias +oust +ouste +outre +p +paf +pan +par +parmi +partant +particulier +particulière +particulièrement +pas +passé +pendant +personne +peu +peut +peuvent +peux +pff +pfft +pfut +pif +plein +plouf +plus +plusieurs +plutôt +pouah +pour +pourquoi +premier +première +premièrement +près +proche +psitt +puisque +q +qu +quand +quant +quanta +quant-à-soi +quarante +quatorze +quatre +quatre-vingt +quatrième +quatrièmement +que +quel +quelconque +quelle +quelles +quelque +quelques +quelqu'un +quels +qui +quiconque +quinze +quoi +quoique +r +revoici +revoilà +rien +s +sa +sacrebleu +sans +sapristi +sauf +se +seize +selon +sept +septième +sera +seront +ses +si +sien +sienne +siennes +siens +sinon +six +sixième +soi +soi-même +soit +soixante +son +sont +sous +stop +suis +suivant +sur +surtout +t +ta +tac +tant +te +té +tel +telle +tellement +telles +tels +tenant +tes +tic +tien +tienne +tiennes +tiens +toc +toi +toi-même +ton +touchant +toujours +tous +tout +toute +toutes +treize +trente +très +trois +troisième +troisièmement +trop +tsoin +tsouin +tu +u +un +une +unes +uns +v +va +vais +vas +vé +vers +via +vif +vifs +vingt +vivat +vive +vives +vlan +voici +voilà +vont +vos +votre +vôtre +vôtres +vous +vous-mêmes +vu +w +x +y +z +zut +aber +als +also +am +an +auch +auf +aus +bei +beim +bis +da +damit +dann +das +daß +dazu +dem +den +denn +der +des +die +dir +doch +dort +durch +ein +eine +einem +einen +einer +eines +euch +für +gegen +im +in +ins +ist +mit +nach +ob +oder +ohne +so +über +um +und +unter +vom +von +vor +zu +zum +zur +a +adesso +ai +al +alla +allo +allora +altre +altri +altro +anche +ancora +avere +aveva +avevano +ben +buono +che +chi +cinque +comprare +con +consecutivi +consecutivo +cosa +cui +da +del +della +dello +dentro +deve +devo +di +doppio +due +e +ecco +fare +fine +fino +fra +gente +giu +ha +hai +hanno +ho +il +indietro +invece +io +la +lavoro +le +lei +lo +loro +lui +lungo +ma +me +meglio +molta +molti +molto +nei +nella +no +noi +nome +nostro +nove +nuovi +nuovo +o +oltre +ora +otto +peggio +pero +persone +piu +poco +primo +promesso +qua +quarto +quasi +quattro +quello +questo +qui +quindi +quinto +rispetto +sara +secondo +sei +sembra +sembrava +senza +sette +sia +siamo +siete +solo +sono +sopra +soprattutto +sotto +stati +stato +stesso +su +subito +sul +sulla +tanto +te +tempo +terzo +tra +tre +triplo +ultimo +un +una +uno +va +vai +voi +volte +vostro +de +la +que +el +en +y +a +los +del +se +las +por +un 
+para +con +no +una +su +al +es +lo +como +más +pero +sus +le +ya +o +fue +este +ha +sé +porque +esta +son +está +cuando +muy +sin +sobre +ser +tiene +también +me +hasta +hay +donde +han +quien +están +estado +desde +todo +nos +durante +estados +todos +uno +les +ni +contra +otros +fueron +ese +eso +había +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +tanto +esa +estos +mucho +quienes +nada +muchos +cual +sea +poco +ella +estar +haber +estas +estaba +estamos +algunas +algo +nosotros diff --git a/invenio/ext/elasticsearch/es_config.py b/invenio/ext/elasticsearch/es_config.py new file mode 100644 index 000000000..a5eff521d --- /dev/null +++ b/invenio/ext/elasticsearch/es_config.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- +## +## This file is part of Invenio. +## Copyright (C) 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""General config file for ES index.""" + + +################ Fields ############### + +def get_collections_fields_config(): + """Mapping for collections.""" + mapping = { + "recid": {"type": "integer"}, + "name": { + "type": "string", + "analyzer": "keyword"}} + + return mapping + + +def get_documents_fields_config(): + """Mapping for documents.""" + mapping = { + #force recid type to integer for default sorting + "recid": {"type": "integer"}, + "fulltext": {"type": "string"}, + } + return mapping + + +def get_records_fields_config(): + """Mapping for records.""" + from invenio.modules.jsonalchemy.parser import FieldParser + fields = FieldParser.field_definitions('recordext') + mapping = {} + for name, value in fields.iteritems(): + current_mapping = value.get("elasticsearch", {}).get("mapping") + if current_mapping: + mapping.update(current_mapping) + return mapping + + +################ Highlights ############### + +HIGHLIGHTS_BASE_CONFIG = { + "number_of_fragments": 3, + "fragment_size": 70 +} + + +def get_records_highlights_config(): + """Get hilights config for records.""" + from invenio.modules.jsonalchemy.parser import FieldParser + fields = FieldParser.field_definitions('recordext') + highlights = {} + for name, value in fields.iteritems(): + current_highlights = value.get("elasticsearch", {}).get("highlights") + if current_highlights: + highlights.update(current_highlights) + config = { + "fields": highlights + } + return config + + +def get_documents_highlights_config(): + """Get hilights config for fulltext document.""" + config = { + "fields": { + "fulltext": HIGHLIGHTS_BASE_CONFIG + } + } + return config + + +################ Facets ############### + +def get_records_facets_config(): + """Get facets config for records.""" + from invenio.modules.jsonalchemy.parser import FieldParser + fields = FieldParser.field_definitions('recordext') + facets = {} + for name, value in fields.iteritems(): + current_facet = value.get("elasticsearch", {}).get("facets") + if 
current_facet: + facets.update(current_facet) + return facets diff --git a/invenio/ext/elasticsearch/es_query.py b/invenio/ext/elasticsearch/es_query.py new file mode 100644 index 000000000..97b61efdf --- /dev/null +++ b/invenio/ext/elasticsearch/es_query.py @@ -0,0 +1,140 @@ +# -*- coding: utf-8 -*- +## +## This file is part of Invenio. +## Copyright (C) 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""A warpper between Invenio and elasticsearch. + +invenio.ext.elasticsearch.es_query +---------------------------------- + +A warpper between Invenio and elasticsearch. + +usage: + >>> from es_query import process_es_query, process_es_results + >>> es = ElasticSearch(app) + >>> es.query_handler(process_es_query) + >>> es.results_handler(process_es_results) + +""" +from UserDict import UserDict + + +def process_es_query(query): + """Convert an Invenio query into an ES query. + + :param query: [string] Invenio query + + :return: [dict] ES query + """ + es_query = { + "query": { + "bool": { + "should": [{ + "query_string": { + "query": query + } + }, + { + "has_child": { + "type": "documents", + "query": { + "query_string": { + "default_field": "fulltext", + "query": query + } + } + } + }], + "minimum_should_match": 1 + } + } + } + return es_query + + +def process_es_results(results): + """Convert a ES results into a Invenio search engine result. + + :param results: [object] elasticsearch results + + :return: [object] standard Invenio search engine results + """ + return Response(results) + + +class Response(object): + + """An Invenio response object. + + Contains, Hits, Facet results and Higlights. + """ + + def __init__(self, data): + """New Response instance.""" + self.data = data + self.hits = Hits(data) + self.facets = Facets(data) + self.highlights = Highlights(data) + + +class Hits(object): + + """Iterator over all recids that matched the query.""" + + def __init__(self, data): + """New Hits instance.""" + self.data = data.get("hits") + + def __iter__(self): + """Iteration over values. + + TODO: query with token if you ask for more then len(self) + """ + for hit in self.data['hits']: + yield int(hit['_id']) + + def __len__(self): + """Number of elements.""" + return self.data['total'] + + +class Facets(UserDict): + + """Facet response objects.""" + + def __init__(self, data): + """New Facets instance.""" + UserDict.__init__(self, data.get("aggregations")) + + +class Highlights(UserDict): + + """Hightlights response objects.""" + + def __init__(self, data): + """New Hightlights instance. + + TODO: add fulltext highlights. 
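A small sketch of how the pieces above fit together: ``process_es_query`` builds the bool/should query (record fields plus child fulltext documents), and the ``Response`` wrapper distinguishes the total hit count from the current page. Here ``res`` is assumed to be the value returned by ``es.search(...)`` and the query string is illustrative::

    from invenio.ext.elasticsearch.es_query import process_es_query

    es_query = process_es_query("title:neutrino")
    assert es_query["query"]["bool"]["minimum_should_match"] == 1

    # res is assumed to come from es.search(...).
    total_matches = len(res.hits)                  # total reported by ES
    page_recids = [recid for recid in res.hits]    # only the current page
    snippets = res.highlights.get(page_recids[0], {}) if page_recids else {}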
+ """ + new_data = {} + for hit in data.get('hits', {}).get('hits', []): + if hit.get("highlight"): + new_data[int(hit.get('_id'))] = hit.get("highlight") + else: + new_data[int(hit.get('_id'))] = {} + UserDict.__init__(self, new_data) diff --git a/invenio/ext/elasticsearch/jsonext/__init__.py b/invenio/ext/elasticsearch/jsonext/__init__.py new file mode 100644 index 000000000..d6b7525ea --- /dev/null +++ b/invenio/ext/elasticsearch/jsonext/__init__.py @@ -0,0 +1,19 @@ +## +## This file is part of Invenio. +## Copyright (C) 2013, 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""JsonAchemy parser for elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/jsonext/parsers/__init__.py b/invenio/ext/elasticsearch/jsonext/parsers/__init__.py new file mode 100644 index 000000000..d6b7525ea --- /dev/null +++ b/invenio/ext/elasticsearch/jsonext/parsers/__init__.py @@ -0,0 +1,19 @@ +## +## This file is part of Invenio. +## Copyright (C) 2013, 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""JsonAchemy parser for elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py b/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py new file mode 100644 index 000000000..9d960d146 --- /dev/null +++ b/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +## +## This file is part of Invenio. +## Copyright (C) 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+ +"""JsonAchemy parser for elasticsearch Invenio extension.""" + +from pyparsing import Keyword, Suppress, Word, Optional, Each, \ + OneOrMore, alphanums, restOfLine + +from invenio.base.utils import try_to_eval +from invenio.modules.jsonalchemy.parser import FieldBaseExtensionParser, \ + indentedBlock, DICT_DEF +from invenio.modules.jsonalchemy.registry import functions + + +class ElasticSearchParser(FieldBaseExtensionParser): + + """ElasticSearch jsonalchemy extension.""" + + __parsername__ = 'elasticsearch' + + @classmethod + def parse_element(cls, indent_stack): + """Parse ``elasticsearch`` section. + + This is an example of the content of this section:: + @extend + title: + elasticsearch: + mapping: { + "properties": { + "title": { + "index_name": "title", + "type": "multi_field", + "fields": { + "title": { + "type": "string", + "analyzer": "standard" + }, + "sort_title": { + "type": "string", + "analyzer": "simple" + } + } + } + } + } + local_tokenizer: + title.title, invenio.ext.elasticsearch.token1 + title.subtitle, invenio.ext.elasticsearch.token2 + facets: { + "authors": { + "terms" : { + "field" : "facet_authors", + "size": 10, + "order" : { "_count" : "desc" } + } + } + } + highlights: { + "number_of_fragments" : 3, + "fragment_size" : 70 + } + + + """ + mapping = (Keyword('mapping:').suppress() + DICT_DEF)\ + .setResultsName('mapping')\ + .setParseAction(lambda toks: toks[0]) + tokenizer_field = Word(alphanums + '_' + '.') + local_tokenizer = (Keyword('local_tokenizer:').suppress() + + indentedBlock( + OneOrMore(tokenizer_field + + Suppress(',') + + restOfLine), indent_stack) + ).setResultsName('local_tokenizer') + facets = (Keyword('facets:').suppress() + DICT_DEF)\ + .setResultsName('facets')\ + .setParseAction(lambda toks: toks[0]) + + highlights = (Keyword('highlights:').suppress() + DICT_DEF)\ + .setResultsName('highlights')\ + .setParseAction(lambda toks: toks[0]) + + return (Keyword('elasticsearch:').suppress() + + indentedBlock( + Each([Optional(mapping), + Optional(local_tokenizer), + Optional(facets), + Optional(highlights)]), + indent_stack) + ).setResultsName('elasticsearch') + + @classmethod + def create_element(cls, rule, namespace): + """Create the dictionary with the info from the configuratin file.""" + element = dict() + element['mapping'] = try_to_eval( + rule.mapping, functions(namespace)) if rule.mapping else {} + element['facets'] = try_to_eval( + rule.facets, functions(namespace)) if rule.facets else {} + element['highlights'] = try_to_eval( + rule.highlights, functions(namespace)) if rule.highlights else {} + element['local_tokenizer'] = [] + if rule.local_tokenizer: + for i in range(0, len(rule.local_tokenizer), 2): + element['local_tokenizer'].append( + (rule.local_tokenizer[i], rule.local_tokenizer[i + 1])) + return element + +parser = ElasticSearchParser diff --git a/invenio/ext/elasticsearch/recordext/__init__.py b/invenio/ext/elasticsearch/recordext/__init__.py new file mode 100644 index 000000000..3952cf3b5 --- /dev/null +++ b/invenio/ext/elasticsearch/recordext/__init__.py @@ -0,0 +1,19 @@ +## +## This file is part of Invenio. +## Copyright (C) 2013, 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. 
+##
+## Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""Basic JsonAlchemy records configuration."""
diff --git a/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg b/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg
new file mode 100644
index 000000000..5e6cb1803
--- /dev/null
+++ b/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg
@@ -0,0 +1,31 @@
+@extend
+recid:
+    elasticsearch:
+        mapping: {
+            "recid": {
+                "type": "integer"
+            }
+        }
+
+@extend
+modification_date:
+    elasticsearch:
+        mapping: {
+            "modification_date": {
+                "type": "date"
+            }
+        }
+
+@extend
+creation_date:
+    elasticsearch:
+        mapping: {
+            "creation_date": {
+                "type": "date"
+            }
+        }
+
+invenio_collections:
+    calculated:
+        invenio.ext.elasticsearch.get_col(self.get("recid"))
+
diff --git a/invenio/ext/elasticsearch/scripts/test_es.py b/invenio/ext/elasticsearch/scripts/test_es.py
new file mode 100644
index 000000000..9d07e7de7
--- /dev/null
+++ b/invenio/ext/elasticsearch/scripts/test_es.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+## -*- coding: utf-8 -*-
+##
+## This file is part of Invenio.
+## Copyright (C) 2014 CERN.
+##
+## Invenio is free software; you can redistribute it and/or
+## modify it under the terms of the GNU General Public License as
+## published by the Free Software Foundation; either version 2 of the
+## License, or (at your option) any later version.
+##
+## Invenio is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+## General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with Invenio; if not, write to the Free Software Foundation, Inc.,
+## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+
+"""Ad hoc test script for the elasticsearch extension."""
+
+#---------------------------- Modules -----------------------------------------
+
+# import of standard modules
+from __future__ import print_function
+from optparse import OptionParser

+# third party modules
+
+
+#---------------------------- Main Part ---------------------------------------
+
+if __name__ == '__main__':
+
+    usage = "usage: %prog [options]"
+
+    parser = OptionParser(usage)
+
+    parser.set_description("""
+    This script tests elasticsearch for Invenio.
+    It should be replaced by unit tests.
+    Basically it creates a new index, indexes 100 records and performs a
+    search query.
+ """) + + (options, args) = parser.parse_args() + from invenio.base.factory import create_app + current_app = create_app() + + with current_app.test_request_context(): + print ("-- Connect to the ES server --") + es = current_app.extensions.get("elasticsearch") + + print("-- Delete old index --") + es.delete_index() + + print("-- Create the index --") + es.create_index() + + print("-- Index records --") + es.index_records(range(1, 100), bulk_size=10) + + print("-- Index documents --") + es.index_documents(range(1, 100), bulk_size=10) + + print("-- Index collections --") + es.index_collections(range(1, 100), bulk_size=1000) + + print("-- Perform search --") + #res = es.search(query="fulltext:test") + res = es.search(query="title:Sneutrinos", + facet_filters=[("facet_authors", "Schael, S"), + ("facet_authors", "Bruneliere, R")]) + + print("Hits:") + print ([hit for hit in res.hits]) + + import json + print("Facets:") + print(json.dumps(res.facets.data, indent=2)) + + print("Highlights:") + print(json.dumps(res.highlights.data, indent=2)) diff --git a/invenio/ext/elasticsearch/tasks.py b/invenio/ext/elasticsearch/tasks.py new file mode 100644 index 000000000..8e2358404 --- /dev/null +++ b/invenio/ext/elasticsearch/tasks.py @@ -0,0 +1,40 @@ +## -*- coding: utf-8 -*- +## +## This file is part of Invenio. +## Copyright (C) 2014 CERN. +## +## Invenio is free software; you can redistribute it and/or +## modify it under the terms of the GNU General Public License as +## published by the Free Software Foundation; either version 2 of the +## License, or (at your option) any later version. +## +## Invenio is distributed in the hope that it will be useful, but +## WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +## General Public License for more details. +## +## You should have received a copy of the GNU General Public License +## along with Invenio; if not, write to the Free Software Foundation, Inc., +## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""ES function to submit Celery tasks.""" + +from invenio.celery import celery + + +@celery.task +def index_records(sender, recid): + """Celery function to index records.""" + from flask import current_app + current_app.extensions.get("elasticsearch").index_records([recid]) + #TODO: get_text seems async should be replaced by a signal? + import time + time.sleep(1) + current_app.extensions.get("elasticsearch").index_documents([recid]) + + +@celery.task +def index_collections(sender, collections): + """Celery function to index collections.""" + from flask import current_app + current_app.extensions.get("elasticsearch").index_collections() diff --git a/requirements-elasticsearch.txt b/requirements-elasticsearch.txt new file mode 100644 index 000000000..bfdd4470e --- /dev/null +++ b/requirements-elasticsearch.txt @@ -0,0 +1 @@ +pyelasticsearch>=0.6.1