diff --git a/Procfile b/Procfile index efaf2c6e9..e8adb91a5 100644 --- a/Procfile +++ b/Procfile @@ -1,4 +1,5 @@ web: inveniomanage runserver cache: redis-server worker: celery worker --purge -E -A invenio.celery.celery --loglevel=DEBUG --workdir=$VIRTUAL_ENV workermon: flower --broker=redis://localhost:6379/1 +indexer: elasticsearch --config=elasticsearch.yml diff --git a/Procfile-elasticsearch b/Procfile-elasticsearch deleted file mode 100644 index 6527941c2..000000000 --- a/Procfile-elasticsearch +++ /dev/null @@ -1,5 +0,0 @@ -web: inveniomanage runserver -cache: redis-server -worker: celery worker -E -A invenio.celery.celery --loglevel=INFO --workdir=$VIRTUAL_ENV -workermon: flower --broker=redis://localhost:6379/1 -indexer: elasticsearch -D es.config=invenio/ext/elasticsearch/config/elasticsearch.yml diff --git a/invenio/ext/elasticsearch/config/elasticsearch.yml b/elasticsearch.yml similarity index 77% rename from invenio/ext/elasticsearch/config/elasticsearch.yml rename to elasticsearch.yml index 9ba9b1a7e..55634310d 100644 --- a/invenio/ext/elasticsearch/config/elasticsearch.yml +++ b/elasticsearch.yml @@ -1,370 +1,385 @@ -#################### ElasticSearch Configuration Example ##################### +##################### Elasticsearch Configuration Example ##################### # This file contains an overview of various configuration settings, # targeted at operations staff. Application developers should # consult the guide at . # # The installation procedure is covered at # . # -# ElasticSearch comes with reasonable defaults for most settings, +# Elasticsearch comes with reasonable defaults for most settings, # so you can try it out without bothering with configuration. # # Most of the time, these defaults are just fine for running a production # cluster. If you're fine-tuning your cluster, or wondering about the # effect of certain configuration option, please _do ask_ on the # mailing list or IRC channel [http://elasticsearch.org/community]. # Any element in the configuration can be replaced with environment variables # by placing them in ${...} notation. For example: # -# node.rack: ${RACK_ENV_VAR} +#node.rack: ${RACK_ENV_VAR} # For information on supported formats and syntax for the config file, see # ################################### Cluster ################################### # Cluster name identifies your cluster for auto-discovery. If you're running # multiple clusters on the same network, make sure you're using unique names. # -# cluster.name: elasticsearch cluster.name: invenio #################################### Node ##################################### # Node names are generated dynamically on startup, so you're relieved # from configuring them manually. You can tie this node to a specific name: # -# node.name: "Franz Kafka" -node.name: "node1" +#node.name: "Franz Kafka" # Every node can be configured to allow or deny being eligible as the master, # and to allow or deny to store the data. # # Allow this node to be eligible as a master node (enabled by default): # -# node.master: true +#node.master: true # # Allow this node to store data (enabled by default): # -# node.data: true +#node.data: true # You can exploit these settings to design advanced cluster topologies. # # 1. You want this node to never become a master node, only to hold data. # This will be the "workhorse" of your cluster. # -# node.master: false -# node.data: true +#node.master: false +#node.data: true # # 2. You want this node to only serve as a master: to not store any data and # to have free resources. This will be the "coordinator" of your cluster. # -# node.master: true -# node.data: false +#node.master: true +#node.data: false # # 3. You want this node to be neither master nor data node, but # to act as a "search load balancer" (fetching data from nodes, # aggregating results, etc.) # -# node.master: false -# node.data: false +#node.master: false +#node.data: false # Use the Cluster Health API [http://localhost:9200/_cluster/health], the -# Node Info API [http://localhost:9200/_cluster/nodes] or GUI tools -# such as and +# Node Info API [http://localhost:9200/_nodes] or GUI tools +# such as , +# , +# and # to inspect the cluster state. # A node can have generic attributes associated with it, which can later be used # for customized shard allocation filtering, or allocation awareness. An attribute # is a simple key value pair, similar to node.key: value, here is an example: # -# node.rack: rack314 +#node.rack: rack314 # By default, multiple nodes are allowed to start from the same installation location # to disable it, set the following: -# node.max_local_storage_nodes: 1 +#node.max_local_storage_nodes: 1 #################################### Index #################################### # You can set a number of options (such as shard/replica options, mapping # or analyzer definitions, translog settings, ...) for indices globally, # in this file. # # Note, that it makes more sense to configure index settings specifically for # a certain index, either when creating it or by using the index templates API. # # See and # # for more information. # Set the number of shards (splits) of an index (5 by default): # -# index.number_of_shards: 5 +#index.number_of_shards: 5 # Set the number of replicas (additional copies) of an index (1 by default): # -# index.number_of_replicas: 1 +#index.number_of_replicas: 1 # Note, that for development on a local machine, with small indices, it usually # makes sense to "disable" the distributed features: # -# index.number_of_shards: 1 -# index.number_of_replicas: 0 +#index.number_of_shards: 1 +#index.number_of_replicas: 0 # These settings directly affect the performance of index and search operations # in your cluster. Assuming you have enough machines to hold shards and # replicas, the rule of thumb is: # # 1. Having more *shards* enhances the _indexing_ performance and allows to # _distribute_ a big index across machines. # 2. Having more *replicas* enhances the _search_ performance and improves the # cluster _availability_. # # The "number_of_shards" is a one-time setting for an index. # # The "number_of_replicas" can be increased or decreased anytime, # by using the Index Update Settings API. # -# ElasticSearch takes care about load balancing, relocating, gathering the +# Elasticsearch takes care about load balancing, relocating, gathering the # results from nodes, etc. Experiment with different settings to fine-tune # your setup. # Use the Index Status API () to inspect # the index status. #################################### Paths #################################### # Path to directory containing configuration (this file and logging.yml): # -# path.conf: /path/to/conf +#path.conf: /path/to/conf # Path to directory where to store index data allocated for this node. # -# path.data: /path/to/data +path.data: /usr/local/var/elasticsearch/ # # Can optionally include more than one location, causing data to be striped across # the locations (a la RAID 0) on a file level, favouring locations with most free # space on creation. For example: # -# path.data: /path/to/data1,/path/to/data2 +#path.data: /path/to/data1,/path/to/data2 # Path to temporary files: # -# path.work: /path/to/work +#path.work: /path/to/work # Path to log files: # -# path.logs: /path/to/logs +path.logs: /usr/local/var/log/elasticsearch/ # Path to where plugins are installed: # -# path.plugins: /path/to/plugins +path.plugins: /usr/local/var/lib/elasticsearch/plugins #################################### Plugin ################################### # If a plugin listed here is not installed for current node, the node will not start. # -# plugin.mandatory: mapper-attachments,lang-groovy +#plugin.mandatory: mapper-attachments,lang-groovy ################################### Memory #################################### -# ElasticSearch performs poorly when JVM starts swapping: you should ensure that +# Elasticsearch performs poorly when JVM starts swapping: you should ensure that # it _never_ swaps. # # Set this property to true to lock the memory: # -# bootstrap.mlockall: true -bootstrap.mlockall: true +#bootstrap.mlockall: true # Make sure that the ES_MIN_MEM and ES_MAX_MEM environment variables are set # to the same value, and that the machine has enough memory to allocate -# for ElasticSearch, leaving enough memory for the operating system itself. +# for Elasticsearch, leaving enough memory for the operating system itself. # -# You should also make sure that the ElasticSearch process is allowed to lock +# You should also make sure that the Elasticsearch process is allowed to lock # the memory, eg. by using `ulimit -l unlimited`. ############################## Network And HTTP ############################### -# ElasticSearch, by default, binds itself to the 0.0.0.0 address, and listens +# Elasticsearch, by default, binds itself to the 0.0.0.0 address, and listens # on port [9200-9300] for HTTP traffic and on port [9300-9400] for node-to-node # communication. (the range means that if the port is busy, it will automatically # try the next port). # Set the bind address specifically (IPv4 or IPv6): # -# network.bind_host: 192.168.0.1 +#network.bind_host: 192.168.0.1 # Set the address other nodes will use to communicate with this node. If not # set, it is automatically derived. It must point to an actual IP address. # -# network.publish_host: 192.168.0.1 +#network.publish_host: 192.168.0.1 # Set both 'bind_host' and 'publish_host': # -# network.host: 192.168.0.1 +network.host: 127.0.0.1 # Set a custom port for the node to node communication (9300 by default): # -# transport.tcp.port: 9300 -transport.tcp.port: 9300 +#transport.tcp.port: 9300 # Enable compression for all communication between nodes (disabled by default): # -# transport.tcp.compress: true +#transport.tcp.compress: true # Set a custom port to listen for HTTP traffic: # -# http.port: 9200 -http.port: 9200 +#http.port: 9200 # Set a custom allowed content length: # -# http.max_content_length: 100mb +#http.max_content_length: 100mb # Disable HTTP completely: # -# http.enabled: false +#http.enabled: false ################################### Gateway ################################### # The gateway allows for persisting the cluster state between full cluster # restarts. Every change to the state (such as adding an index) will be stored # in the gateway, and when the cluster starts up for the first time, # it will read its state from the gateway. # There are several types of gateway implementations. For more information, see # . # The default gateway type is the "local" gateway (recommended): # -# gateway.type: local +#gateway.type: local # Settings below control how and when to start the initial recovery process on # a full cluster restart (to reuse as much local data as possible when using shared # gateway). # Allow recovery process after N nodes in a cluster are up: # -# gateway.recover_after_nodes: 1 +#gateway.recover_after_nodes: 1 # Set the timeout to initiate the recovery process, once the N nodes # from previous setting are up (accepts time value): # -# gateway.recover_after_time: 5m +#gateway.recover_after_time: 5m # Set how many nodes are expected in this cluster. Once these N nodes # are up (and recover_after_nodes is met), begin recovery process immediately # (without waiting for recover_after_time to expire): # -# gateway.expected_nodes: 2 +#gateway.expected_nodes: 2 ############################# Recovery Throttling ############################# # These settings allow to control the process of shards allocation between # nodes during initial recovery, replica allocation, rebalancing, # or when adding and removing nodes. # Set the number of concurrent recoveries happening on a node: # # 1. During the initial recovery # -# cluster.routing.allocation.node_initial_primaries_recoveries: 4 +#cluster.routing.allocation.node_initial_primaries_recoveries: 4 # # 2. During adding/removing nodes, rebalancing, etc # -# cluster.routing.allocation.node_concurrent_recoveries: 2 +#cluster.routing.allocation.node_concurrent_recoveries: 2 # Set to throttle throughput when recovering (eg. 100mb, by default 20mb): # -# indices.recovery.max_bytes_per_sec: 20mb +#indices.recovery.max_bytes_per_sec: 20mb # Set to limit the number of open concurrent streams when # recovering a shard from a peer: # -# indices.recovery.concurrent_streams: 5 +#indices.recovery.concurrent_streams: 5 ################################## Discovery ################################## # Discovery infrastructure ensures nodes can be found within a cluster # and master node is elected. Multicast discovery is the default. # Set to ensure a node sees N other master eligible nodes to be considered -# operational within the cluster. Its recommended to set it to a higher value -# than 1 when running more than 2 nodes in the cluster. +# operational within the cluster. This should be set to a quorum/majority of +# the master-eligible nodes in the cluster. # -# discovery.zen.minimum_master_nodes: 1 +#discovery.zen.minimum_master_nodes: 1 # Set the time to wait for ping responses from other nodes when discovering. # Set this option to a higher value on a slow or congested network # to minimize discovery failures: # -# discovery.zen.ping.timeout: 3s +#discovery.zen.ping.timeout: 3s # For more information, see # # Unicast discovery allows to explicitly control which nodes will be used # to discover the cluster. It can be used when multicast is not present, # or to restrict the cluster communication-wise. # # 1. Disable multicast discovery (enabled by default): # -# discovery.zen.ping.multicast.enabled: false -discovery.zen.ping.multicast.enabled: false +#discovery.zen.ping.multicast.enabled: false # # 2. Configure an initial list of master nodes in the cluster # to perform discovery when new nodes (master or data) are started: # -# discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] +#discovery.zen.ping.unicast.hosts: ["host1", "host2:port"] # EC2 discovery allows to use AWS EC2 API in order to perform discovery. # # You have to install the cloud-aws plugin for enabling the EC2 discovery. # # For more information, see # # # See # for a step-by-step tutorial. +# GCE discovery allows to use Google Compute Engine API in order to perform discovery. +# +# You have to install the cloud-gce plugin for enabling the GCE discovery. +# +# For more information, see . + +# Azure discovery allows to use Azure API in order to perform discovery. +# +# You have to install the cloud-azure plugin for enabling the Azure discovery. +# +# For more information, see . ################################## Slow Log ################################## # Shard level query and fetch threshold logging. #index.search.slowlog.threshold.query.warn: 10s #index.search.slowlog.threshold.query.info: 5s #index.search.slowlog.threshold.query.debug: 2s #index.search.slowlog.threshold.query.trace: 500ms #index.search.slowlog.threshold.fetch.warn: 1s #index.search.slowlog.threshold.fetch.info: 800ms #index.search.slowlog.threshold.fetch.debug: 500ms #index.search.slowlog.threshold.fetch.trace: 200ms #index.indexing.slowlog.threshold.index.warn: 10s #index.indexing.slowlog.threshold.index.info: 5s #index.indexing.slowlog.threshold.index.debug: 2s #index.indexing.slowlog.threshold.index.trace: 500ms ################################## GC Logging ################################ #monitor.jvm.gc.young.warn: 1000ms #monitor.jvm.gc.young.info: 700ms #monitor.jvm.gc.young.debug: 400ms #monitor.jvm.gc.old.warn: 10s #monitor.jvm.gc.old.info: 5s #monitor.jvm.gc.old.debug: 2s + +################################## Security ################################ + +# Uncomment if you want to enable JSONP as a valid return transport on the +# http server. With this enabled, it may pose a security risk, so disabling +# it unless you need it is recommended (it is disabled by default). +# +#http.jsonp.enable: true diff --git a/invenio/ext/elasticsearch/__init__.py b/invenio/ext/elasticsearch/__init__.py deleted file mode 100644 index 8bf4f4880..000000000 --- a/invenio/ext/elasticsearch/__init__.py +++ /dev/null @@ -1,862 +0,0 @@ -# -# This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -""" Elasticsearch extension for Invenio. - -invenio.ext.elasticsearch -------------------------- -Elasticsearch a search engine for Invenio. - -It should be able to perform: - - metadata and fulltext search almost without DB - - metadata facets such as authors, Invenio collection facets both with the - corresponding filters - - fulltext and metadata fields highlightings - - fast collection indexing, mid-fast metadata indexing, almost fast fulltext - indexing - - -Requirements -^^^^^^^^^^^^ - -Elasticsearch >= 1.0, pyelasticsearch. - - -Installation (for developpement) -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Download the tarball form http://elasticsearch.org, uncompress it and add the -bin/ directory in your path. -Go to the invenio src directory and run ``honcho start`` or equivalent. - - -Deployment -^^^^^^^^^^^ - -... - -Testing -^^^^^^^ - -This extension is working with the demosite. Indexing is done automagicaly -using webcoll/bibupload signals. Note: bibindex is not required. - -Usage -^^^^^ - >>> es = current_app.extensions.get("elasticsearch") - >>> res = es.search(query="title:Sneutrinos", - facet_filters=[("facet_authors", "Schael, S"), - ("facet_authors", "Bruneliere, R")]) - -see: `a simple of search interface `_ -or invenio/ext/elasticsearch/scripts/test_es.py for a complete manual -indexing and searching example. - -TODO: - - adding exceptions - - decide if we create one ES document type for each JsonAlchemy document type - - convert an Invenio query into a ES query - - add sort options in JsonAchemy - - check collection access restriction with collection filters - - probably needs a collection exclusion list as search params - - Note: file access restriction is not managed by the indexer - - multi-lingual support (combo plugin, in config files) - - term boosting configuration (in JsonAlchemy?) - - search by marc field support? - - test hierachical collections - - test similar documents - - and many more... -""" -from werkzeug.utils import cached_property -from pyelasticsearch import ElasticSearch as PyElasticSearch - - -class ElasticSearch(object): - - """ - Flask extension. - - Initialization of the extension: - - >>> from flask import Flask - >>> from flask_elasticsearch import ElasticSearch - >>> app = Flask('myapp') - >>> s = ElasticSearch(app=app) - - or alternatively using the factory pattern: - - >>> app = Flask('myapp') - >>> s = ElasticSearch() - >>> s.init_app(app) - """ - - def __init__(self, app=None): - """Build the extension object.""" - self.app = app - - #default process functions - self.process_query = lambda x: x - self.process_results = lambda x: x - - # TODO: to put in config? - self.records_doc_type = "records" - self.documents_doc_type = "documents" - self.collections_doc_type = "collections" - - # to cache recids collections - self._recids_collections = {} - - if app is not None: - self.init_app(app) - - def init_app(self, app): - """ - Initialize a Flask application. - - Only one Registry per application is allowed. - """ - app.config.setdefault('ELASTICSEARCH_URL', 'http://localhost:9200/') - app.config.setdefault('ELASTICSEARCH_INDEX', "invenio") - app.config.setdefault('ELASTICSEARCH_NUMBER_OF_SHARDS', 1) - app.config.setdefault('ELASTICSEARCH_NUMBER_OF_REPLICAS', 0) - app.config.setdefault('ELASTICSEARCH_DATE_DETECTION', False) - app.config.setdefault('ELASTICSEARCH_NUMERIC_DETECTION', False) - app.config.setdefault('ELASTICSEARCH_ANALYSIS', { - "default": {"type": "simple"}}) - - # Follow the Flask guidelines on usage of app.extensions - if not hasattr(app, 'extensions'): - app.extensions = {} - if 'elasticsearch' in app.extensions: - raise Exception("Flask application already initialized") - - app.extensions['elasticsearch'] = self - self.app = app - - @cached_property - def connection(self): - """Return a pyelasticsearch connection object.""" - return PyElasticSearch(self.app.config['ELASTICSEARCH_URL']) - - def set_query_handler(self, handler): - """ - Specify a function to convert the invenio query into a ES query. - - :param handler: [function] take a query[string] parameter - """ - self.process_query = handler - - def set_results_handler(self, handler): - """ - Set a function to process the search results. - - To convert ES search results into an object understandable by Invenio. - - :param handler: [function] take a query[string] parameter - """ - self.process_results = handler - - @property - def status(self): - """The status of the ES cluster. - - See: http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/cluster-health.html - for more. - - TODO: is it useful? - - :return: [string] possible values: green, yellow, red. green means all - ok including replication, yellow means replication not active, red - means partial results. - """ - return self.connection.health().get("status") - - def index_exists(self, index=None): - """Check if the index exists in the cluster. - - :param index: [string] index name - - :return: [bool] True if exists - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - if self.connection.status().get("indices").get(index): - return True - return False - - def delete_index(self, index=None): - """Delete the given index. - - :param index: [string] index name - - :return: [bool] True if success - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - try: - self.connection.delete_index(index=index) - return True - except: - return False - - def create_index(self, index=None): - """Create the given index. - - Also set basic configuration and doc type mappings. - - :param index: [string] index name - - :return: [bool] True if success - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - if self.index_exists(index=index): - return True - try: - #create index - index_settings = { - #should be set to 1 for exact facet count - "number_of_shards": - self.app.config['ELASTICSEARCH_NUMBER_OF_SHARDS'], - - #in case of primary shard failed - "number_of_replicas": - self.app.config['ELASTICSEARCH_NUMBER_OF_REPLICAS'], - - #disable automatic type detection - #that can cause errors depending of the indexing order - "date_detection": - self.app.config['ELASTICSEARCH_DATE_DETECTION'], - "numeric_detection": - self.app.config['ELASTICSEARCH_NUMERIC_DETECTION'] - } - if self.app.config['ELASTICSEARCH_ANALYSIS']: - index_settings["analysis"] = \ - self.app.config['ELASTICSEARCH_ANALYSIS'] - - self.connection.create_index(index=index, settings=index_settings) - - from es_config import get_records_fields_config, \ - get_documents_fields_config, \ - get_collections_fields_config - #mappings - self._mapping(index=index, doc_type=self.records_doc_type, - fields_mapping=get_records_fields_config()) - - self._mapping(index=index, doc_type=self.documents_doc_type, - fields_mapping=get_documents_fields_config(), - parent_type=self.records_doc_type) - - self._mapping(index=index, doc_type=self.collections_doc_type, - fields_mapping=get_collections_fields_config(), - parent_type=self.records_doc_type) - - return True - except: - return False - - def _mapping(self, index, doc_type, fields_mapping, parent_type=None): - mapping = { - doc_type: { - "properties": fields_mapping - } - } - - # specific conf for join like query - if parent_type: - mapping[doc_type]["_parent"] = {"type": parent_type} - try: - self.connection.put_mapping(index=index, doc_type=doc_type, - mapping=mapping) - return True - except: - return False - - def _bulk_index_docs(self, docs, doc_type, index): - if not docs: - return [] - self.app.logger.info("Indexing: %d records for %s" % (len(docs), - doc_type)) - results = self.connection.bulk_index(index=index, - doc_type=doc_type, docs=docs, - id_field='_id', - refresh=self.app.config.get("DEBUG")) - errors = [] - for it in results.get("items"): - if it.get("index").get("error"): - errors.append((it.get("index").get("_id"), it.get("index").get("error"))) - return errors - - def _documents_has_been_updated(self, recid): - from invenio.legacy.bibdocfile.api import BibRecDocs - import datetime - - bibdocs = BibRecDocs(recid) - #TODO: replace legacy code - from invenio.legacy.dbquery import run_sql - (record_creation_date, record_modification_date) = \ - run_sql("SELECT creation_date, modification_date from bibrec where id=%s" - % (recid))[0] - - #wait for a JsonAlchemy bug resolution - #record = self._get_record(recid) - - #record_modification_date = \ - # datetime.datetime.strptime(record.get("modification_date"), - # "%Y-%m-%dT%H:%M:%S") - #record_creation_date = \ - # datetime.datetime.strptime(record.get("creation_date"), - # "%Y-%m-%dT%H:%M:%S.%f") - if not bibdocs.list_bibdocs(): - self.app.logger.debug("No docs for: %s" % recid) - for b in bibdocs.list_bibdocs(): - #should add fews seconds for rounding problem - if b.md + datetime.timedelta(seconds=2) >= record_modification_date: - return True - return False - - def _get_record(self, recid): - from invenio.modules.records.api import get_record - record_as_dict = get_record(recid, reset_cache=True).dumps() - del record_as_dict["__meta_metadata__"] - return record_as_dict - - def _get_text(self, recid): - from invenio.legacy.bibdocfile.api import BibRecDocs - text = BibRecDocs(recid).get_text(True) - if not text: - self.app.logger.debug("No text for:%s" % recid) - return None - return { - "fulltext": text, - "recid": recid, - "_id": recid, - "_parent": recid} - - def _get_collections(self, recid): - return { - "recid": recid, - "_id": recid, - "_parent": recid, - "name": self._recids_collections.get(recid, "")} - - def get_all_collections_for_records(self, recreate_cache_if_needed=True): - """Return a dict with recid as key and collection list as value. - - This replace existing Invenio function for performance reason. - - :param recreate_cache_if_needed: [bool] True if regenerate the cache - """ - from invenio.legacy.search_engine import collection_reclist_cache, get_collection_reclist - from invenio.legacy.websearch.webcoll import Collection - ret = {} - - #update the cache? - if recreate_cache_if_needed: - collection_reclist_cache.recreate_cache_if_needed() - - for name in collection_reclist_cache.cache.keys(): - recids = get_collection_reclist(name, recreate_cache_if_needed=False) - full_path_name = "/".join([v.name for v in - Collection(name).get_ancestors()] + - [name]) - for recid in recids: - ret.setdefault(recid, []).append(full_path_name) - self._recids_collections = ret - - def index_collections(self, recids=None, index=None, bulk_size=100000, **kwargs): - """Index collections. - - Collections maps computed by webcoll is indexed into the given index in - order to allow filtering by collection. - - :param recids: [list of int] recids to index - :param index: [string] index name - :param bulk_size: [int] batch size to index - - :return: [list of int] list of recids not indexed due to errors - """ - self.get_all_collections_for_records() - if not recids: - recids = self._recids_collections.keys() - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - return self._index_docs(recids, self.collections_doc_type, index, - bulk_size, self._get_collections) - - def index_documents(self, recids, index=None, bulk_size=100000, **kwargs): - """Index fulltext files. - - Put the fullext extracted by Invenio into the given index. - - :param recids: [list of int] recids to index - :param index: [string] index name - :param bulk_size: [int] batch size to index - - :return: [list of int] list of recids not indexed due to errors - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - recids_to_index = filter(self._documents_has_been_updated, recids) - if recids_to_index: - self.app.logger.debug("Indexing document for %s" % recids) - return self._index_docs(recids_to_index, self.documents_doc_type, index, - bulk_size, self._get_text) - - def index_records(self, recids, index=None, bulk_size=100000, **kwargs): - """Index bibliographic records. - - The document structure is provided by JsonAlchemy. - - Note: the __metadata__ is removed for the moment. - - TODO: is should be renamed as index? - - :param recids: [list of int] recids to index - :param index: [string] index name - :param bulk_size: [int] batch size to index - - :return: [list] list of recids not indexed due to errors - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - return self._index_docs(recids, self.records_doc_type, index, - bulk_size, self._get_record) - - def _index_docs(self, recids, doc_type, index, bulk_size, get_docs): - docs = [] - errors = [] - for recid in recids: - doc = get_docs(recid) - if doc: - docs.append(doc) - if len(docs) >= bulk_size: - errors += self._bulk_index_docs(docs, doc_type=doc_type, - index=index) - docs = [] - errors += self._bulk_index_docs(docs, doc_type=doc_type, index=index) - return errors - - def find_similar(self, recid, index=None, **kwargs): - """Find simlar documents to the given recid. - - TODO: tests - - :param recid: [int] document id to find similar - :param index: [string] index name - - :return: [list] list of recids - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - fields_to_compute_similarity = ["_all"] - return self.connection.more_like_this(index=index, - doc_type=self.records_doc_type, - id=recid, - mlt_fields=fields_to_compute_similarity) - - def _search_records(self, query=None, index=None, main_options={}, sort_options={}, - facets={}, highlights={}, facet_filters=[]): - """Perform search on records. - - It compute: - - hits - - metadata facets - - metadata highlights - - :param query: [nested dict] ES query derived from Invenio query - :param index: [string] index name - :param main_options: [nested dict] ES main options such as paging - :param sort_options: [nested dict] ES sort options - :param facets: [nested dict] ES facets configuration - :param highlights: [nested dict] ES highlight configuration - :param facet_filters: [nested dict] ES facet filters, i.e. when the - user click on the facet term - - Here is the basic form of the query: - { - #here search options such as paging - "query": { - "filtered" : { - "query": { - "bool": { - #this is valid for a pure textual query, will - #becomes a more complexe query with a Invenio to ES - #converter see: - #http://pythonhosted.org/Whoosh/parsing.html for - #create_create_basic_search_unit - #inspiration - "should": [{ - #records query - }, - { - #fulltext is on query part as is a part of ranking - "has_child": { - "type": "documents", - "query": {#children query} - } - }], - # or condition - "minimum_should_match" : 1 - } - } - }, - "filter": { - "bool": { - "must": [ - #facet filters including collection filters using has_child - ] - } - } - }, - "sort": {#sort options}, - "highlights": {#hightlight options}, - "facets": {#facets options without facet_filter as it is done on the query part} - - } - """ - if not query: - query = { - "query": { - "match_all": {} - } - } - es_query = {} - es_query.update(main_options) - if facet_filters: - es_query.update({ - "query": { - "filtered": { - "query": query.get("query"), - "filter": { - "bool": { - "must": facet_filters - } - } - } - } - }) - else: - es_query.update(query) - es_query.update(sort_options) - es_query.update(facets) - es_query.update(highlights) - results = self.process_results(self.connection.search(es_query, - index=index, - doc_type=self.records_doc_type)) - return (results, es_query) - - def _search_documents(self, query, index, filtered_ids): - """Preform a search query to extract hilights from documents. - - :param query: [nested dict] ES query derived from Invenio query - :param index: [string] index name - :param filtered_ids: [list of int] list of record ids return by the - records search query - - :return: [object] response - - Here is the basic form of the query: - { - "size": 10, - "fields": [], - "query": { - "filtered": { - "query": {#fulltext query similar than records query with has_child -> has_parent}, - "filter": { - "ids": { - "values": #ids returned by records search - } - } - } - }, - "highlight": {#fulltext highlights config} - } - """ - from es_config import get_documents_highlights_config - documents_query = { - "size": 10, - "fields": [], - "query": { - "filtered": { - "query": { - "query_string": { - "default_field": "fulltext", - "query": query}}, - "filter": { - "ids": { - "values": filtered_ids - } - } - } - }, - "highlight": get_documents_highlights_config() - } - - #submit query for fulltext highlighting - return self.process_results(self.connection.search(documents_query, - index=index, - doc_type=self.documents_doc_type)) - - def _search_collections(self, records_query, index, include_collections): - """Perform a search query to extract hilights from documents. - - :param records_query: [nested dict] query used to search into records - :param index: [string] index name - :param exclude_collections: [list of strings] collection name to exclude to facets - - :return: [object] response - - Here is the basic form of the query: - { - #no results only facets/aggregators computation - "size": 0, - "query": { - "has_parent": { - "parent_type" : "records", - "query": #same as records query - }, - }, - "aggs": { - "collections": { - "terms" : { - "field" : "name", - #options - "min_doc_count": 1, - "order" : { "_term" : "asc" }, - } - } - } - } - """ - collections_query = { - "size": 0, - "query": { - "has_parent": { - "parent_type": "records", - "query": records_query.get("query") - } - }, - "aggs": { - "collections": { - "terms": { - "field": "name", - "min_doc_count": 1, - "order": {"_term": "asc"} - } - } - } - } - - return self.process_results(self.connection.search(collections_query, - index=index, - doc_type=self.collections_doc_type)) - - def search(self, query, index=None, cc=None, f="", rg=None, - sf=None, so="d", jrec=0, facet_filters=[], **kwargs): - """Perform a search query. - - Note: a lot of work to do. - - :param query: [string] search query - :param recids: [list of int] recids to index - :param index: [string] index name - :param cc: [string] main collection name - :param f: [string] field to search (not yet used) - :param rg: [int] number of results to return - :param sf: [string] sort field - :param so: [string] sort order in [d,a] - :param jrec: [int] result offset for paging - :param facet_filters: [list of tupple of strings] filters to prune the - results. Each filter is defined as a tupple of term, value: (i.e. - [("facet_authors", "Ellis, J.")]) - - :return: [object] response - """ - if index is None: - index = self.app.config['ELASTICSEARCH_INDEX'] - - if cc is None: - cc = self.app.config['CFG_SITE_NAME'] - - if rg is None: - rg = int(self.app.config['CFG_WEBSEARCH_DEF_RECORDS_IN_GROUPS']) - #converted Invenio query - es_query = self.process_query(query) - - #search main options - main_options = { - "size": rg, - "from": jrec, - "fields": [] - } - - # sorting - sort_options = {} - if sf: - sort_options = { - "sort": [{ - "sort_%s" % sf: { - "order": "desc" if so == "d" else "asc" - } - }] - } - - # facet_filters - include_collection = [] - #es_filters = [] - es_filters = [{ - "has_child": { - "type": self.collections_doc_type, - "filter": { - "term": { - "name": { - "value": cc - } - } - } - } - }] - for ft in facet_filters: - (term, value) = ft - if term == "facet_collections": - include_collection.append(value) - es_filters.append({ - "has_child": { - "type": self.collections_doc_type, - "filter": { - "term": { - "name": { - "value": value - } - } - } - } - }) - else: - es_filters.append({ - "term": { - term: value - } - }) - if not include_collection: - include_collection = [cc] - - # facet configuration - from es_config import get_records_facets_config - facets = { - "aggs": get_records_facets_config() - } - - # hightlight configuration - from es_config import get_records_highlights_config - highlights = { - "highlight": get_records_highlights_config() - } - - (results, records_query) = self._search_records(query=es_query, index=index, - main_options=main_options, - sort_options=sort_options, - facets=facets, - highlights=highlights, - facet_filters=es_filters) - - #build query for fulltext highlighting - matched_ids = [recid for recid in results.hits] - hi_results = self._search_documents(query=query, index=index, filtered_ids=matched_ids) - - #merge with existing metadata highlights - for recid, hi in hi_results.highlights.iteritems(): - results.highlights.data[int(recid)].update(hi) - - #compute facets for collections - cols_results = self._search_collections(records_query=records_query, - index=index, - include_collections=include_collection) - results.facets.update(cols_results.facets) - - return results - - -def index_record(sender, recid): - """ - Index a given record. - - Used to connect to signal. - - :param recid: [int] recid to index - """ - from .tasks import index_records - return index_records.delay(sender, recid) - - -def index_collections(sender, collections): - """ - Index a given ghe collection. - - Used to connect to signal. - - Note: all collections are indexed as it is fast. - :param collections: [list of string] collection names - """ - from .tasks import index_collections - return index_collections.delay(sender, []) - - -def drop_index(sender, *args, **kwargs): - """ - Remove the elasticsearch index. - - Used to connect to signal. - """ - from flask import current_app - es = current_app.extensions.get("elasticsearch") - es.delete_index() - - -def create_index(sender, *args, **kwargs): - """ - Create the elasticsearch index. - - Index creation, settings and mapping. - - Used to connect to signal. - """ - from flask import current_app - es = current_app.extensions.get("elasticsearch") - es.delete_index() - es.create_index() - - -def setup_app(app): - """Set up the extension for the given app.""" - from es_query import process_es_query, process_es_results - es = ElasticSearch(app) - es.set_query_handler(process_es_query) - es.set_results_handler(process_es_results) - - app.extensions["registry"]["packages"].register("invenio.ext.elasticsearch") - from invenio.base import signals - signals.record_after_update.connect(index_record) - signals.record_after_create.connect(index_record) - signals.webcoll_after_reclist_cache_update.connect(index_collections) - from invenio.base.scripts.database import recreate, drop, create - signals.pre_command.connect(drop_index, sender=drop) - signals.post_command.connect(create_index, sender=create) - signals.pre_command.connect(drop_index, sender=recreate) - signals.post_command.connect(create_index, sender=recreate) diff --git a/invenio/ext/elasticsearch/config/__init__.py b/invenio/ext/elasticsearch/config/__init__.py deleted file mode 100644 index 26aaae802..000000000 --- a/invenio/ext/elasticsearch/config/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""Config part of elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/config/invenio_stop.txt b/invenio/ext/elasticsearch/config/invenio_stop.txt deleted file mode 100644 index 9160c449e..000000000 --- a/invenio/ext/elasticsearch/config/invenio_stop.txt +++ /dev/null @@ -1,783 +0,0 @@ -and -will -their -then -there -these -that -they -or -as -a -but -in -for -the -of -no -into -be -with -not -by -on -at -was -to -such -it -if -is -are -this - -à -â -abord -afin -ah -ai -aie -ainsi -allaient -allo -allô -allons -après -assez -attendu -au -aucun -aucune -aujourd -aujourd'hui -auquel -aura -auront -aussi -autre -autres -aux -auxquelles -auxquels -avaient -avais -avait -avant -avec -avoir -ayant -b -bah -beaucoup -bien -bigre -boum -bravo -brrr -c -ça -car -ce -ceci -cela -celle -celle-ci -celle-là -celles -celles-ci -celles-là -celui -celui-ci -celui-là -cent -cependant -certain -certaine -certaines -certains -certes -ces -cet -cette -ceux -ceux-ci -ceux-là -chacun -chaque -cher -chère -chères -chers -chez -chiche -chut -ci -cinq -cinquantaine -cinquante -cinquantième -cinquième -clac -clic -combien -comme -comment -compris -concernant -contre -couic -crac -d -da -dans -de -debout -dedans -dehors -delà -depuis -derrière -des -dès -désormais -desquelles -desquels -dessous -dessus -deux -deuxième -deuxièmement -devant -devers -devra -différent -différente -différentes -différents -dire -divers -diverse -diverses -dix -dix-huit -dixième -dix-neuf -dix-sept -doit -doivent -donc -dont -douze -douzième -dring -du -duquel -durant -e -effet -eh -elle -elle-même -elles -elles-mêmes -en -encore -entre -envers -environ -es -ès -est -et -etant -étaient -étais -était -étant -etc -été -etre -être -eu -euh -eux -eux-mêmes -excepté -f -façon -fais -faisaient -faisant -fait -feront -fi -flac -floc -font -g -gens -h -ha -hé -hein -hélas -hem -hep -hi -ho -holà -hop -hormis -hors -hou -houp -hue -hui -huit -huitième -hum -hurrah -i -il -ils -importe -j -je -jusqu -jusque -k -l -la -là -laquelle -las -le -lequel -les -lès -lesquelles -lesquels -leur -leurs -longtemps -lorsque -lui -lui-même -m -ma -maint -mais -malgré -me -même -mêmes -merci -mes -mien -mienne -miennes -miens -mille -mince -moi -moi-même -moins -mon -moyennant -n -na -ne -néanmoins -neuf -neuvième -ni -nombreuses -nombreux -non -nos -notre -nôtre -nôtres -nous -nous-mêmes -nul -o -o| -ô -oh -ohé -olé -ollé -on -ont -onze -onzième -ore -ou -où -ouf -ouias -oust -ouste -outre -p -paf -pan -par -parmi -partant -particulier -particulière -particulièrement -pas -passé -pendant -personne -peu -peut -peuvent -peux -pff -pfft -pfut -pif -plein -plouf -plus -plusieurs -plutôt -pouah -pour -pourquoi -premier -première -premièrement -près -proche -psitt -puisque -q -qu -quand -quant -quanta -quant-à-soi -quarante -quatorze -quatre -quatre-vingt -quatrième -quatrièmement -que -quel -quelconque -quelle -quelles -quelque -quelques -quelqu'un -quels -qui -quiconque -quinze -quoi -quoique -r -revoici -revoilà -rien -s -sa -sacrebleu -sans -sapristi -sauf -se -seize -selon -sept -septième -sera -seront -ses -si -sien -sienne -siennes -siens -sinon -six -sixième -soi -soi-même -soit -soixante -son -sont -sous -stop -suis -suivant -sur -surtout -t -ta -tac -tant -te -té -tel -telle -tellement -telles -tels -tenant -tes -tic -tien -tienne -tiennes -tiens -toc -toi -toi-même -ton -touchant -toujours -tous -tout -toute -toutes -treize -trente -très -trois -troisième -troisièmement -trop -tsoin -tsouin -tu -u -un -une -unes -uns -v -va -vais -vas -vé -vers -via -vif -vifs -vingt -vivat -vive -vives -vlan -voici -voilà -vont -vos -votre -vôtre -vôtres -vous -vous-mêmes -vu -w -x -y -z -zut -aber -als -also -am -an -auch -auf -aus -bei -beim -bis -da -damit -dann -das -daß -dazu -dem -den -denn -der -des -die -dir -doch -dort -durch -ein -eine -einem -einen -einer -eines -euch -für -gegen -im -in -ins -ist -mit -nach -ob -oder -ohne -so -über -um -und -unter -vom -von -vor -zu -zum -zur -a -adesso -ai -al -alla -allo -allora -altre -altri -altro -anche -ancora -avere -aveva -avevano -ben -buono -che -chi -cinque -comprare -con -consecutivi -consecutivo -cosa -cui -da -del -della -dello -dentro -deve -devo -di -doppio -due -e -ecco -fare -fine -fino -fra -gente -giu -ha -hai -hanno -ho -il -indietro -invece -io -la -lavoro -le -lei -lo -loro -lui -lungo -ma -me -meglio -molta -molti -molto -nei -nella -no -noi -nome -nostro -nove -nuovi -nuovo -o -oltre -ora -otto -peggio -pero -persone -piu -poco -primo -promesso -qua -quarto -quasi -quattro -quello -questo -qui -quindi -quinto -rispetto -sara -secondo -sei -sembra -sembrava -senza -sette -sia -siamo -siete -solo -sono -sopra -soprattutto -sotto -stati -stato -stesso -su -subito -sul -sulla -tanto -te -tempo -terzo -tra -tre -triplo -ultimo -un -una -uno -va -vai -voi -volte -vostro -de -la -que -el -en -y -a -los -del -se -las -por -un -para -con -no -una -su -al -es -lo -como -más -pero -sus -le -ya -o -fue -este -ha -sé -porque -esta -son -está -cuando -muy -sin -sobre -ser -tiene -también -me -hasta -hay -donde -han -quien -están -estado -desde -todo -nos -durante -estados -todos -uno -les -ni -contra -otros -fueron -ese -eso -había -ante -ellos -e -esto -mí -antes -algunos -qué -unos -yo -otro -otras -otra -tanto -esa -estos -mucho -quienes -nada -muchos -cual -sea -poco -ella -estar -haber -estas -estaba -estamos -algunas -algo -nosotros diff --git a/invenio/ext/elasticsearch/es_config.py b/invenio/ext/elasticsearch/es_config.py deleted file mode 100644 index 4b6cc41dd..000000000 --- a/invenio/ext/elasticsearch/es_config.py +++ /dev/null @@ -1,102 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""General config file for ES index.""" - - -################ Fields ############### - -def get_collections_fields_config(): - """Mapping for collections.""" - mapping = { - "recid": {"type": "integer"}, - "name": { - "type": "string", - "analyzer": "keyword"}} - - return mapping - - -def get_documents_fields_config(): - """Mapping for documents.""" - mapping = { - #force recid type to integer for default sorting - "recid": {"type": "integer"}, - "fulltext": {"type": "string"}, - } - return mapping - - -def get_records_fields_config(): - """Mapping for records.""" - from invenio.modules.jsonalchemy.parser import FieldParser - fields = FieldParser.field_definitions('recordext') - mapping = {} - for name, value in fields.iteritems(): - current_mapping = value.get("elasticsearch", {}).get("mapping") - if current_mapping: - mapping.update(current_mapping) - return mapping - - -################ Highlights ############### - -HIGHLIGHTS_BASE_CONFIG = { - "number_of_fragments": 3, - "fragment_size": 70 -} - - -def get_records_highlights_config(): - """Get hilights config for records.""" - from invenio.modules.jsonalchemy.parser import FieldParser - fields = FieldParser.field_definitions('recordext') - highlights = {} - for name, value in fields.iteritems(): - current_highlights = value.get("elasticsearch", {}).get("highlights") - if current_highlights: - highlights.update(current_highlights) - config = { - "fields": highlights - } - return config - - -def get_documents_highlights_config(): - """Get hilights config for fulltext document.""" - config = { - "fields": { - "fulltext": HIGHLIGHTS_BASE_CONFIG - } - } - return config - - -################ Facets ############### - -def get_records_facets_config(): - """Get facets config for records.""" - from invenio.modules.jsonalchemy.parser import FieldParser - fields = FieldParser.field_definitions('recordext') - facets = {} - for name, value in fields.iteritems(): - current_facet = value.get("elasticsearch", {}).get("facets") - if current_facet: - facets.update(current_facet) - return facets diff --git a/invenio/ext/elasticsearch/es_query.py b/invenio/ext/elasticsearch/es_query.py deleted file mode 100644 index e0f4c2871..000000000 --- a/invenio/ext/elasticsearch/es_query.py +++ /dev/null @@ -1,140 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""A warpper between Invenio and elasticsearch. - -invenio.ext.elasticsearch.es_query ----------------------------------- - -A warpper between Invenio and elasticsearch. - -usage: - >>> from es_query import process_es_query, process_es_results - >>> es = ElasticSearch(app) - >>> es.query_handler(process_es_query) - >>> es.results_handler(process_es_results) - -""" -from UserDict import UserDict - - -def process_es_query(query): - """Convert an Invenio query into an ES query. - - :param query: [string] Invenio query - - :return: [dict] ES query - """ - es_query = { - "query": { - "bool": { - "should": [{ - "query_string": { - "query": query - } - }, - { - "has_child": { - "type": "documents", - "query": { - "query_string": { - "default_field": "fulltext", - "query": query - } - } - } - }], - "minimum_should_match": 1 - } - } - } - return es_query - - -def process_es_results(results): - """Convert a ES results into a Invenio search engine result. - - :param results: [object] elasticsearch results - - :return: [object] standard Invenio search engine results - """ - return Response(results) - - -class Response(object): - - """An Invenio response object. - - Contains, Hits, Facet results and Higlights. - """ - - def __init__(self, data): - """New Response instance.""" - self.data = data - self.hits = Hits(data) - self.facets = Facets(data) - self.highlights = Highlights(data) - - -class Hits(object): - - """Iterator over all recids that matched the query.""" - - def __init__(self, data): - """New Hits instance.""" - self.data = data.get("hits") - - def __iter__(self): - """Iteration over values. - - TODO: query with token if you ask for more then len(self) - """ - for hit in self.data['hits']: - yield int(hit['_id']) - - def __len__(self): - """Number of elements.""" - return self.data['total'] - - -class Facets(UserDict): - - """Facet response objects.""" - - def __init__(self, data): - """New Facets instance.""" - UserDict.__init__(self, data.get("aggregations")) - - -class Highlights(UserDict): - - """Hightlights response objects.""" - - def __init__(self, data): - """New Hightlights instance. - - TODO: add fulltext highlights. - """ - new_data = {} - for hit in data.get('hits', {}).get('hits', []): - if hit.get("highlight"): - new_data[int(hit.get('_id'))] = hit.get("highlight") - else: - new_data[int(hit.get('_id'))] = {} - UserDict.__init__(self, new_data) diff --git a/invenio/ext/elasticsearch/jsonext/__init__.py b/invenio/ext/elasticsearch/jsonext/__init__.py deleted file mode 100644 index 5a63afdf1..000000000 --- a/invenio/ext/elasticsearch/jsonext/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""JsonAchemy parser for elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/jsonext/parsers/__init__.py b/invenio/ext/elasticsearch/jsonext/parsers/__init__.py deleted file mode 100644 index 5a63afdf1..000000000 --- a/invenio/ext/elasticsearch/jsonext/parsers/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""JsonAchemy parser for elasticsearch Invenio extension.""" diff --git a/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py b/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py deleted file mode 100644 index e5c17d18b..000000000 --- a/invenio/ext/elasticsearch/jsonext/parsers/elasticsearch_parser.py +++ /dev/null @@ -1,126 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""JsonAchemy parser for elasticsearch Invenio extension.""" - -from pyparsing import Keyword, Suppress, Word, Optional, Each, \ - OneOrMore, alphanums, restOfLine - -from invenio.base.utils import try_to_eval -from invenio.modules.jsonalchemy.parser import FieldBaseExtensionParser, \ - indentedBlock, DICT_DEF -from invenio.modules.jsonalchemy.registry import functions - - -class ElasticSearchParser(FieldBaseExtensionParser): - - """ElasticSearch jsonalchemy extension.""" - - __parsername__ = 'elasticsearch' - - @classmethod - def parse_element(cls, indent_stack): - """Parse ``elasticsearch`` section. - - This is an example of the content of this section:: - @extend - title: - elasticsearch: - mapping: { - "properties": { - "title": { - "index_name": "title", - "type": "multi_field", - "fields": { - "title": { - "type": "string", - "analyzer": "standard" - }, - "sort_title": { - "type": "string", - "analyzer": "simple" - } - } - } - } - } - local_tokenizer: - title.title, invenio.ext.elasticsearch.token1 - title.subtitle, invenio.ext.elasticsearch.token2 - facets: { - "authors": { - "terms" : { - "field" : "facet_authors", - "size": 10, - "order" : { "_count" : "desc" } - } - } - } - highlights: { - "number_of_fragments" : 3, - "fragment_size" : 70 - } - - - """ - mapping = (Keyword('mapping:').suppress() + DICT_DEF)\ - .setResultsName('mapping')\ - .setParseAction(lambda toks: toks[0]) - tokenizer_field = Word(alphanums + '_' + '.') - local_tokenizer = (Keyword('local_tokenizer:').suppress() + - indentedBlock( - OneOrMore(tokenizer_field + - Suppress(',') + - restOfLine), indent_stack) - ).setResultsName('local_tokenizer') - facets = (Keyword('facets:').suppress() + DICT_DEF)\ - .setResultsName('facets')\ - .setParseAction(lambda toks: toks[0]) - - highlights = (Keyword('highlights:').suppress() + DICT_DEF)\ - .setResultsName('highlights')\ - .setParseAction(lambda toks: toks[0]) - - return (Keyword('elasticsearch:').suppress() + - indentedBlock( - Each([Optional(mapping), - Optional(local_tokenizer), - Optional(facets), - Optional(highlights)]), - indent_stack) - ).setResultsName('elasticsearch') - - @classmethod - def create_element(cls, rule, namespace): - """Create the dictionary with the info from the configuratin file.""" - element = dict() - element['mapping'] = try_to_eval( - rule.mapping, functions(namespace)) if rule.mapping else {} - element['facets'] = try_to_eval( - rule.facets, functions(namespace)) if rule.facets else {} - element['highlights'] = try_to_eval( - rule.highlights, functions(namespace)) if rule.highlights else {} - element['local_tokenizer'] = [] - if rule.local_tokenizer: - for i in range(0, len(rule.local_tokenizer), 2): - element['local_tokenizer'].append( - (rule.local_tokenizer[i], rule.local_tokenizer[i + 1])) - return element - -parser = ElasticSearchParser diff --git a/invenio/ext/elasticsearch/recordext/__init__.py b/invenio/ext/elasticsearch/recordext/__init__.py deleted file mode 100644 index cdc2b149d..000000000 --- a/invenio/ext/elasticsearch/recordext/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# -# This file is part of Invenio. -# Copyright (C) 2013, 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""Basic JsonAlchemy records configuration.""" diff --git a/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg b/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg deleted file mode 100644 index 5e6cb1803..000000000 --- a/invenio/ext/elasticsearch/recordext/fields/elasticsearhext.cfg +++ /dev/null @@ -1,31 +0,0 @@ -@extend -recid: - elasticsearch: - mapping: { - "recid": { - "type": "integer" - } - } - -@extend -modification_date: - elasticsearch: - mapping: { - "modification_date": { - "type": "date" - } - } - -@extend -creation_date: - elasticsearch: - mapping: { - "modification_date": { - "type": "date" - } - } - -invenio_collections: - calculated: - invenio.ext.elasticsearch.get_col(self.get("recid")) - diff --git a/invenio/ext/elasticsearch/scripts/test_es.py b/invenio/ext/elasticsearch/scripts/test_es.py deleted file mode 100644 index 63e3177d1..000000000 --- a/invenio/ext/elasticsearch/scripts/test_es.py +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. -# -*- coding: utf-8 -*- - -"""Bad test script.""" - -#---------------------------- Modules ----------------------------------------- - -# import of standard modules -from __future__ import print_function -from optparse import OptionParser - -# third party modules - - -#---------------------------- Main Part --------------------------------------- - -if __name__ == '__main__': - - usage = "usage: %prog [options]" - - parser = OptionParser(usage) - - parser.set_description(""" - This script is done to test elasticserarch for Invenio. - It should be replaced by unittests. - Bascally it create a new index, index 100 records and perform a search - query. - """) - - (options, args) = parser.parse_args() - from invenio.base.factory import create_app - current_app = create_app() - - with current_app.test_request_context(): - print ("-- Connect to the ES server --") - es = current_app.extensions.get("elasticsearch") - - print("-- Delete old index --") - es.delete_index() - - print("-- Create the index --") - es.create_index() - - print("-- Index records --") - es.index_records(range(1, 100), bulk_size=10) - - print("-- Index documents --") - es.index_documents(range(1, 100), bulk_size=10) - - print("-- Index collections --") - es.index_collections(range(1, 100), bulk_size=1000) - - print("-- Perform search --") - #res = es.search(query="fulltext:test") - res = es.search(query="title:Sneutrinos", - facet_filters=[("facet_authors", "Schael, S"), - ("facet_authors", "Bruneliere, R")]) - - print("Hits:") - print ([hit for hit in res.hits]) - - import json - print("Facets:") - print(json.dumps(res.facets.data, indent=2)) - - print("Highlights:") - print(json.dumps(res.highlights.data, indent=2)) diff --git a/invenio/ext/elasticsearch/tasks.py b/invenio/ext/elasticsearch/tasks.py deleted file mode 100644 index 992c08e77..000000000 --- a/invenio/ext/elasticsearch/tasks.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# -# This file is part of Invenio. -# Copyright (C) 2014 CERN. -# -# Invenio is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License as -# published by the Free Software Foundation; either version 2 of the -# License, or (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, but -# WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -# General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio; if not, write to the Free Software Foundation, Inc., -# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. - -"""ES function to submit Celery tasks.""" - -from invenio.celery import celery - - -@celery.task -def index_records(sender, recid): - """Celery function to index records.""" - from flask import current_app - current_app.extensions.get("elasticsearch").index_records([recid]) - #TODO: get_text seems async should be replaced by a signal? - import time - time.sleep(1) - current_app.extensions.get("elasticsearch").index_documents([recid]) - - -@celery.task -def index_collections(sender, collections): - """Celery function to index collections.""" - from flask import current_app - current_app.extensions.get("elasticsearch").index_collections() diff --git a/invenio/ext/es.py b/invenio/ext/es.py new file mode 100644 index 000000000..20d089771 --- /dev/null +++ b/invenio/ext/es.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2015 CERN. +# +# Invenio is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation; either version 2 of the +# License, or (at your option) any later version. +# +# Invenio is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. + +"""Simplified Elastic Search integration.""" + +from __future__ import absolute_import + +from elasticsearch import Elasticsearch + +from invenio.celery import celery + +es = Elasticsearch() + + +@celery.task +def index_record(recid): + from invenio_records.models import RecordMetadata + record = RecordMetadata.query.get(recid) + es.index( + index='records', + doc_type='record', + body=record.json, + id=record.id + ) + + +def create_index(sender, **kwargs): + es.indices.create(index='records', ignore=400) + + +def delete_index(sender, **kwargs): + es.indices.delete(index='records', ignore=404) + + +def setup_app(app): + """Set up the extension for the given app.""" + from invenio.base import signals + from invenio.base.scripts.database import recreate, drop, create + + from invenio_records.models import RecordMetadata + + from sqlalchemy.event import listens_for + + signals.pre_command.connect(delete_index, sender=drop) + signals.post_command.connect(create_index, sender=create) + signals.pre_command.connect(delete_index, sender=recreate) + signals.post_command.connect(create_index, sender=recreate) + + @listens_for(RecordMetadata, 'after_insert') + @listens_for(RecordMetadata, 'after_update') + def new_record(mapper, connection, target): + index_record.delay(target.id) diff --git a/setup.py b/setup.py index 2cb105b1f..0124e8869 100644 --- a/setup.py +++ b/setup.py @@ -1,337 +1,337 @@ # -*- coding: utf-8 -*- # # This file is part of Invenio. # Copyright (C) 2013, 2014, 2015 CERN. # # Invenio is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # Invenio is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """Invenio is a framework for digital libraries and data repositories. Invenio enables you to run your own digital library or document repository on the web. Invenio covers all aspects of digital library management, from document ingestion, through classification, indexing and further processing, to curation, archiving, and dissemination. The flexibility and performance of Invenio make it a comprehensive solution for management of document repositories of moderate to large sizes (several millions of records). Links ----- * `website `_ * `documentation `_ * `development `_ """ import os import sys from distutils.command.build import build from setuptools import find_packages, setup from setuptools.command.install_lib import install_lib class _build(build): # noqa """Compile catalog before building the package.""" sub_commands = [('compile_catalog', None)] + build.sub_commands class _install_lib(install_lib): # noqa """Custom install_lib command.""" def run(self): """Compile catalog before running installation command.""" install_lib.run(self) self.run_command('compile_catalog') install_requires = [ "alembic>=0.6.6", "Babel>=1.3", "backports.lzma>=0.0.3", "bagit>=1.5.1", "BeautifulSoup>=3.2.1", "BeautifulSoup4>=4.3.2", "celery>=3.1.8", # Cerberus>=0.7.1 api changes and is not yet supported "Cerberus>=0.7,<0.7.1", "chardet>=2.3.0", "datacite>=0.1.0", "dictdiffer>=0.0.3", + "elasticsearch>=1.3.0", "feedparser>=5.1", "fixture>=1.5", "Flask>=0.10.1", "Flask-Admin>=1.1.0", "Flask-Assets>=0.10", "Flask-Babel>=0.9", "Flask-Breadcrumbs>=0.2", "Flask-Cache>=0.12", "Flask-Collect>=1.1.1", "Flask-Email>=1.4.4", "Flask-Gravatar>=0.4.2", "Flask-IIIF>=0.2.0", "Flask-Login>=0.2.7", "Flask-Menu>=0.2", "Flask-OAuthlib>=0.6.0,<0.7", # quick fix for issue #2158 "Flask-Principal>=0.4", "Flask-Registry>=0.2", "Flask-RESTful>=0.2.12", "Flask-Script>=2.0.5", # Development version is used, will switch to >=2.0 once released. "Flask-SQLAlchemy>=2.0", "Flask-WTF>=0.10.2", "cryptography>=0.6", "fs>=0.4", "intbitset>=2.0", "invenio-client>=0.1.0", "invenio-query-parser>=0.2", "itsdangerous>=0.24", "jellyfish>=0.3.2", "Jinja2>=2.7", "jsonschema>=2.4.0", "libmagic>=1.0", "lxml>=3.3", "mechanize>=0.2.5", "mistune>=0.4.1", "msgpack-python>=0.3", "MySQL-python>=1.2.5", "numpy>=1.7", "nydus>=0.10.8", # pyparsing>=2.0.2 has a new api and is not compatible yet "passlib>=1.6.2", "pyparsing>=2.0.1,<2.0.2", "python-twitter>=2.0", "pyPDF>=1.13", "pyPDF2>=1.17", "PyLD>=0.5.2", "pyStemmer>=1.3", "python-dateutil>=1.5", "python-magic>=0.4.6", "pytz>=2014.1", "rauth>=0.7.0", "raven>=5.0.0", "rdflib>=4.1.2", "redis>=2.8.0", "reportlab>=2.7,<3.2", "requests>=2.3,<2.4", "setuptools>=2.2", "six>=1.7.2", "toposort>=1.4", "Sphinx>=1.3", "SQLAlchemy>=1.0", "SQLAlchemy-Utils[encrypted]>=0.30.1", "toposort>=1.4", "unidecode>=0.04.1", "workflow>=1.2.0", "WTForms>=2.0.1", "WTForms-Alchemy>=0.13.1", "WTForms-SQLAlchemy>=0.1", "pyyaml>=3.11", ] extras_require = { "docs": [ "sphinx_rtd_theme>=0.1.7" ], "development": [ "Flask-DebugToolbar==0.9.0", "watchdog==0.8.3", ], "dropbox": [ "dropbox>=2.1.0" ], "elasticsearch": [ - "pyelasticsearch>=0.6.1" ], "googledrive": [ "google-api-python-client>=1.2", "apiclient>=1.0.0", "oauth2client>=1.4.0", "urllib3>=1.8.3" ], "img": [ "qrcode>=5.1", "Pillow>=2.7.0" ], "mongo": [ "pymongo>=3.0" ], "misc": [ # was requirements-extras "gnuplot-py==1.8", "flake8>=2.0.0", # extra=kwalitee? "pychecker==0.8.19", # extra=kwalitee? "pylint>=1.4.0", # extra=kwalitee? "nosexcover>=1.0.0", # test? "python-onedrive>=15.0.0", # extra=cloud? "python-openid>=2.2.0", # extra=sso? ], "mixer": [ "mixer>=5.1.0", ], "sso": [ "Flask-SSO>=0.2" ], "postgresql": [ "psycopg2>=2.5", ], # Alternative XML parser # # For pyRXP, the version on PyPI many not be the right one. # # $ pip install # > https://www.reportlab.com/ftp/pyRXP-1.16-daily-unix.tar.gz#egg=pyRXP # "pyrxp": [ # Any other versions are not supported. "pyRXP==1.16-daily-unix" ], "rabbitmq": [ "amqp>=1.4.5", ], "github": [ "github3.py>=0.9" ], } extras_require["docs"] += extras_require["elasticsearch"] extras_require["docs"] += extras_require["img"] extras_require["docs"] += extras_require["mongo"] extras_require["docs"] += extras_require["sso"] extras_require["docs"] += extras_require["github"] # FIXME extras_require["docs"] += extras_require["dropbox"] # FIXME extras_require["docs"] += extras_require["googledrive"] tests_require = [ "httpretty>=0.8.4", "Flask-Testing>=0.4.1", "mock>=1.0.0", "nose>=1.3.0", "selenium>=2.45.0", "unittest2>=0.5", ] setup_requires = [ 'Babel>=1.3', ] # Add `tests` dependencies to `extras_require` so that developers # could install test dependencies also with pip: extras_require["tests"] = tests_require # Compatibility with Python 2.6 if sys.version_info < (2, 7): install_requires += [ "argparse>=1.3.0", "importlib>=1.0.0" ] # Get the version string. Cannot be done with import! g = {} with open(os.path.join("invenio", "version.py"), "rt") as fp: exec(fp.read(), g) version = g["__version__"] packages = find_packages(exclude=['docs']) packages.append('invenio_docs') setup( name='invenio', version=version, url='https://github.com/inveniosoftware/invenio', license='GPLv2', author='CERN', author_email='info@invenio-software.org', description='Invenio digital library framework', long_description=__doc__, packages=packages, package_dir={'invenio_docs': 'docs'}, include_package_data=True, zip_safe=False, platforms='any', entry_points={ 'console_scripts': [ 'inveniomanage = invenio.base.manage:main', 'plotextractor = invenio.utils.scripts.plotextractor:main', # Legacy 'alertengine = invenio.legacy.webalert.scripts.alertengine:main', 'bibauthorid = ' ' invenio.legacy.bibauthorid.scripts.bibauthorid:main', 'bibcatalog = invenio.legacy.bibcatalog.scripts.bibcatalog:main', 'bibclassify = invenio.modules.classifier.scripts.classifier:main', 'bibconvert = invenio.legacy.bibconvert.scripts.bibconvert:main', 'bibdocfile = invenio.legacy.bibdocfile.scripts.bibdocfile:main', 'bibedit = invenio.legacy.bibedit.scripts.bibedit:main', 'bibencode = invenio.modules.encoder.scripts.encoder:main', 'bibexport = invenio.legacy.bibexport.scripts.bibexport:main', 'bibmatch = invenio.legacy.bibmatch.scripts.bibmatch:main', 'bibsched = invenio.legacy.bibsched.scripts.bibsched:main', 'bibtaskex = invenio.legacy.bibsched.scripts.bibtaskex:main', 'bibtasklet = invenio.legacy.bibsched.scripts.bibtasklet:main', 'bibtex = invenio.modules.sequencegenerator.scripts.bibtex:main', 'bibupload = invenio.legacy.bibupload.scripts.bibupload:main', 'convert_journals = ' ' invenio.legacy.docextract.scripts.convert_journals:main', 'dbexec = invenio.legacy.miscutil.scripts.dbexec:main', 'dbdump = invenio.legacy.miscutil.scripts.dbdump:main', 'docextract = invenio.legacy.docextract.scripts.docextract:main', 'gotoadmin = invenio.modules.redirector.scripts.redirector:main', 'inveniogc = invenio.legacy.websession.scripts.inveniogc:main', 'oairepositoryupdater = ' ' invenio.legacy.oairepository.scripts.oairepositoryupdater:main', 'refextract = invenio.legacy.refextract.scripts.refextract:main', 'textmarc2xmlmarc = ' ' invenio.legacy.bibrecord.scripts.textmarc2xmlmarc:main', 'webaccessadmin = ' ' invenio.modules.access.scripts.webaccessadmin:main', 'webauthorprofile = ' ' invenio.legacy.webauthorprofile.scripts.webauthorprofile:main', 'xmlmarc2textmarc = ' ' invenio.legacy.bibrecord.scripts.xmlmarc2textmarc:main', 'xmlmarclint = invenio.legacy.bibrecord.scripts.xmlmarclint:main', ], "distutils.commands": [ "inveniomanage = invenio.base.setuptools:InvenioManageCommand", ] }, setup_requires=setup_requires, install_requires=install_requires, extras_require=extras_require, classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', 'Intended Audience :: Developers', 'License :: OSI Approved :: GNU General Public License v2' ' or later (GPLv2+)', 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', ], test_suite='invenio.testsuite.suite', tests_require=tests_require, cmdclass={ 'build': _build, 'install_lib': _install_lib, }, )