# -*- coding: utf-8 -*-
##
## $Id$
##
## This file is part of CDS Invenio.
## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN.
##
## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.

"""
Implementation of Google Scholar exporting.

The Google Scholar exporting method answers this use case: every first
of the month, please export all records modified during the last month
and matching these search criteria in an NLM format in such a way that
the output is split into files containing not more than 1000 records
and compressed via gzip and placed in this place from where Google
Scholar would fetch them. The output files would be organized like
this:

* all exportable records:

    /export/google-scholar/all-index.nlm.html   - links to parts below
    /export/google-scholar/all-part1.nlm.xml.gz - first batch of 1000 records
    /export/google-scholar/all-part2.nlm.xml.gz - second batch of 1000 records
    ...
    /export/google-scholar/all-partM.nlm.xml.gz - last batch of 1000 records

* records modified in the last month:

    /export/google-scholar/lastmonth-index.nlm.html   - links to parts below
    /export/google-scholar/lastmonth-part1.nlm.xml.gz - first batch of 1000 records
    /export/google-scholar/lastmonth-part2.nlm.xml.gz - second batch of 1000 records
    ...
    /export/google-scholar/lastmonth-partN.nlm.xml.gz - last batch of 1000 records
"""

import datetime
import gzip
import os

# NOTE: invenio.search_engine is imported inside the three methods that
# actually query or format records, so that importing this module (and using
# its file-handling helpers) does not require the search engine to be loadable.


class GoogleScholarExporter:
    """Export records as gzipped NLM XML part files plus an HTML index.

    The exporter writes two families of files into one output directory:
    one for records modified during the last month and one for all records
    of the configured collections.
    """

    # Directory that receives the generated files (set by the constructor).
    _output_directory = ""
    # Not read anywhere in this module; kept for backward compatibility.
    _records_with_fulltext_only = True
    #FIXME: Read collections from configuration file
    _collections = ["Theses"]

    def __init__(self, output_directory):
        """Create the exporter.

        output_directory - directory where files will be placed; it is
                           created when it does not exist yet.

        Raises GoogleScholarExportException when the directory cannot be
        created or the path is not a directory.
        """
        self.set_output_directory(output_directory)

    def export(self):
        """Export records modified last month, then all records.

        For each of the two sets the previously generated files are deleted
        first, then the part files and the index file are written again.
        """
        LAST_MONTH_FILE_NAME_PATTERN = "lastmonth"
        ALL_MONTH_FILE_NAME_PATTERN = "all"
        # NOTE(review): the module docstring promises batches of 1000
        # records; 3 looks like a leftover debugging value -- confirm with
        # the owner before changing it.
        SPLIT_BY_RECORDS = 3

        # Export records modified last month
        records = self._get_records_modified_last_month()
        self._delete_files(self._output_directory, LAST_MONTH_FILE_NAME_PATTERN)
        self._split_records_into_files(records,
                                       SPLIT_BY_RECORDS,
                                       LAST_MONTH_FILE_NAME_PATTERN,
                                       self._output_directory)

        # Export all records
        all_records = self._get_all_records()
        self._delete_files(self._output_directory, ALL_MONTH_FILE_NAME_PATTERN)
        self._split_records_into_files(all_records,
                                       SPLIT_BY_RECORDS,
                                       ALL_MONTH_FILE_NAME_PATTERN,
                                       self._output_directory)

    def set_output_directory(self, path_to_directory):
        """Set the output directory, creating it when it does not exist.

        path_to_directory - path of the directory; one trailing path
                            separator is removed.

        Raises GoogleScholarExportException when the path is empty, when the
        directory cannot be created, or when the path exists but is not a
        directory.
        """
        directory = path_to_directory

        # An empty path used to fail with a bare IndexError below.
        if not directory:
            self._report_error("%s is not a directory." % (directory, ))

        # Remove the separator from the end of the path if present, but never
        # reduce the root directory to an empty string.
        if len(directory) > 1 and directory.endswith(os.sep):
            directory = directory[:-1]

        # if directory does not exist then create it
        if not os.path.exists(directory):
            try:
                os.makedirs(directory)
            except (IOError, OSError) as exception:
                self._report_error(
                    "Directory %s does not exist and cannot be ctreated." % (directory, ),
                    exception)

        # if it is not path to a directory report an error
        if not os.path.isdir(directory):
            self._report_error("%s is not a directory." % (directory, ))

        self._output_directory = directory

    def _get_records_modified_last_month(self):
        """Return the search result for records dated within the last 31 days.

        The search is restricted to self._collections.
        """
        from invenio.search_engine import perform_request_search

        current_date = datetime.date.today()
        one_month_ago = current_date - datetime.timedelta(days=31)

        #FIXME: Return only records with full texts available for Google Scholar
        #FIXME: There is a problem with searching in modification date.
        #       It searches only in creation date
        return perform_request_search(dt="m",
                                      c=self._collections,
                                      d1y=one_month_ago.year,
                                      d1m=one_month_ago.month,
                                      d1d=one_month_ago.day)

    def _get_all_records(self):
        """Return the search result for all records of self._collections."""
        from invenio.search_engine import perform_request_search

        #FIXME: Return only records with full texts available for Google Scholar
        return perform_request_search(c=self._collections)

    def _split_records_into_files(self, records, max_records_per_file,
                                  file_name_pattern, output_directory):
        """Save records into part files and write the index linking them.

        records - list of record numbers

        max_records_per_file - the maximum number of records per file

        file_name_pattern - the pattern used to name the files. Filenames
                            will start with this pattern.

        output_directory - directory where all the files will be placed

        Nothing is written (not even the index) when records is empty.
        Raises GoogleScholarExportException when max_records_per_file is
        smaller than 1 or when a file cannot be written.
        """
        number_of_records = len(records)
        if number_of_records == 0:
            return

        if max_records_per_file < 1:
            self._report_error("Invalid maximum number of records per file: %s"
                               % (max_records_per_file, ))

        file_number = 0
        for begin in range(0, number_of_records, max_records_per_file):
            file_number += 1
            file_name = self._get_part_file_name(file_name_pattern, file_number)
            self._save_records_into_file(records[begin:begin + max_records_per_file],
                                         file_name,
                                         output_directory)

        # file_number now equals the number of part files written.
        self._create_index_file(file_number, file_name_pattern, output_directory)

    def _get_part_file_name(self, file_name_pattern, file_number):
        """Return the name of the file containing one part of the records.

        file_name_pattern - the pattern used to create the filename

        file_number - the number of the file in the sequence of files

        The result is a filename like lastmonth-part2.nlm.xml.gz
        where lastmonth is the file_name_pattern and 2 is the file_number.
        """
        return "%s-part%d.nlm.xml.gz" % (file_name_pattern, file_number)

    def _create_index_file(self, number_of_files, file_name_pattern, output_directory):
        """Create the HTML file linking to all part files.

        number_of_files - how many part files exist; links are written for
                          parts 1 .. number_of_files inclusive

        file_name_pattern - the pattern the part files were named with

        output_directory - directory where the index file will be placed

        Raises GoogleScholarExportException when the file cannot be written.
        """
        path = os.path.join(output_directory, file_name_pattern + "-index.nlm.html")
        index_file = None

        try:
            index_file = open(path, "w")
            index_file.write("<html><body>")

            # The upper bound is inclusive: the last part must be linked too.
            for file_number in range(1, number_of_files + 1):
                file_name = self._get_part_file_name(file_name_pattern, file_number)
                index_file.write('<a href="%s">%s</a><br>' % (file_name, file_name))

            index_file.write("</body></html>")
        except (IOError, OSError) as exception:
            self._report_error("Failed to create index file.", exception)
        finally:
            # Runs on success and on failure, so the handle never leaks.
            if index_file is not None:
                index_file.close()

    def _save_records_into_file(self, records, file_name, output_directory):
        """Save all the records into a gzipped file in the proper format
        (currently National Library of Medicine XML).

        records - list of record numbers to be written

        file_name - the name of the file where records will be saved

        output_directory - directory where the file will be placed

        Raises GoogleScholarExportException when the file cannot be opened
        or written.
        """
        output_file = self._open_output_file(file_name, output_directory)

        try:
            self._write_to_output_file(output_file, "<articles>\n")

            for record in records:
                self._write_to_output_file(output_file, self._get_record_NLM_XML(record))

            self._write_to_output_file(output_file, "\n</articles>")
        finally:
            # Close even when a write failed and an exception is propagating.
            self._close_output_file(output_file)

    def _open_output_file(self, file_name, output_directory):
        """Open a new gzip file for writing and return it.

        file_name - the name of the file to be created

        output_directory - the directory where file will be created

        Raises GoogleScholarExportException when the file cannot be opened.
        """
        path = os.path.join(output_directory, file_name)

        try:
            return gzip.GzipFile(filename=path, mode="w")
        except (IOError, OSError) as exception:
            self._report_error("Failed to open file file %s." % (path, ), exception)
            return None

    def _close_output_file(self, output_file):
        """Close the file; does nothing when output_file is None."""
        if output_file is None:
            return
        output_file.close()

    def _write_to_output_file(self, output_file, text_to_write):
        """Write the text passed as a parameter to the file.

        Text that is not already a byte string is encoded as UTF-8 first,
        because the gzip stream stores bytes.

        Raises GoogleScholarExportException when writing fails.
        """
        if not isinstance(text_to_write, bytes):
            text_to_write = text_to_write.encode("utf-8")

        try:
            output_file.write(text_to_write)
        except (IOError, OSError) as exception:
            self._report_error("Failed to write to file " + output_file.name, exception)

    def _get_record_NLM_XML(self, record):
        """Return the record formatted by print_record with format 'xn'
        (National Library of Medicine XML)."""
        from invenio.search_engine import print_record

        return print_record(record, format='xn')

    def _delete_files(self, path_to_directory, name_pattern):
        """Delete files with file name starting with name_pattern
        from the directory specified by path_to_directory."""
        for current_file in os.listdir(path_to_directory):
            if current_file.startswith(name_pattern):
                os.remove(os.path.join(path_to_directory, current_file))

    def _report_error(self, error_message, exception=None):
        """Report an error during exporting.

        Always raises GoogleScholarExportException carrying error_message
        and the optional originating exception.
        """
        raise GoogleScholarExportException(error_message, exception)


class GoogleScholarExportException(Exception):
    """Exception indicating an error during exporting for Google Scholar."""

    _error_message = ""
    _inner_exception = None

    def __init__(self, error_message, inner_exception=None):
        """Store the message and the optional exception that caused it."""
        Exception.__init__(self, error_message, inner_exception)

        self._error_message = error_message
        self._inner_exception = inner_exception

    def get_error_message(self):
        """Return the error message that explains the reason for the exception."""
        return self._error_message

    def get_inner_exception(self):
        """Return the inner exception that is the cause for the current exception."""
        return self._inner_exception