SIB-monthly-publications.py
No OneTemporary
Actions

Subscribers

None

File Metadata

Created: Sat, Jan 25, 23:28

SIB-monthly-publications.py
View Options

	#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
	# -- coding: utf-8 --
	"""
	Program to retrieve SIB publications from EuropePMC.org for a given month
	Created on Mon Jun 26 11:22:00 2017
	@author: vioannid, hstockin
	"""

	### to read character encoded files
	import codecs
	import sys
	import os
	import csv
	from pprint import pprint
	import datetime
	import calendar

	### import xml.etree.ElementTree as ET
	from urllib.request import urlopen
	from xml.etree.ElementTree import parse

	### import regular expressions
	import re

	### used for UTF-8 sorting
	import PyICU
	from functools import cmp_to_key

	### error handling
	import logging
	import traceback

	# Retrieve all publications where SIB is mentioned as affiliation in the author list
	# @param year,month,month_nb_days of publication
	def getPublicationsWithSIBAffilliation(year,month,month_nb_days):
	    """Fetch the Europe PMC IDs of one month's SIB-affiliated publications.

	    Sends one search request to the Europe PMC REST service for records whose
	    affiliation field contains "Swiss Institute of Bioinformatics" and whose
	    FIRST_PDATE lies between the first and the last day of the given month.
	    Every ID found is appended to the module-level ``ids`` list and written,
	    one per line, to ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``.

	    @param year: publication year
	    @param month: publication month (zero-padded to two digits in the query)
	    @param month_nb_days: last day of that month, used as the end of the range

	    Exits with status 1 when the response has no hitCount element or when
	    more than 1000 entries match (a single page holds at most 1000 results).
	    """
	    print("SIB publications:", year, month, month_nb_days)
	    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
	    year_month = str(year) + '-' + str(month).zfill(2)
	    date = 'FIRST_PDATE:[' + year_month + '-01%20TO%20' + year_month + '-' + str(month_nb_days) + ']'
	    page = '&cursorMark=*&pageSize=1000' # pagination default is limited to 25 entries
	    # Join the two clauses with an explicit AND instead of running them
	    # together without any separator.
	    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + sib + '%20AND%20' + date + page

	    print(query)

	    # Close the HTTP response as soon as the XML has been parsed.
	    with urlopen(query) as response:
	        doc = parse(response)

	    # hitCount is an xml tag
	    hit_count = doc.findtext("hitCount")
	    if hit_count is None:
	        print('Error: no hitCount element found in the response')
	        sys.exit(1)
	    print('Number of entries found = ' + hit_count)
	    if int(hit_count) > 1000:
	        print('Error: more than 1000 entries found - need to extend query')
	        sys.exit(1)

	    # store publications (result) 'id' into file
	    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
	    with open(fresult, 'w') as fout:
	        # resultList/result are xml tags
	        for item in doc.findall("resultList/result"):
	            pub_id = item.findtext('id')
	            if not pub_id:
	                # a result without an id cannot be looked up later; skip it
	                continue
	            ids.append(pub_id)
	            print(pub_id, file=fout)


	# Read IDs from a file and assign to ids()
	def readIDs():
	    """Load the saved publication IDs into the module-level ``ids`` list.

	    Reads ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt`` (one ID per
	    line) and replaces the contents of ``ids`` with the IDs found there.
	    Replacing rather than appending means that calling this after the IDs
	    were already collected does not store every ID twice. Blank lines are
	    ignored.
	    """
	    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
	    with open(fresult) as fh:
	        stripped = [line.strip() for line in fh]
	    # Slice assignment updates the existing list object in place.
	    ids[:] = [entry for entry in stripped if entry]


	# Retrieve all publications with the following fields:
	# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
	# The list of IDs is read from <resultDirName>/<resultFilePrefix>all-SIB-IDs.txt
	def _elementText(item, tag):
	    """Return the text of child element *tag*, or '' if it is missing or empty."""
	    return item.findtext(tag) or ''

	def _formatAuthors(authorString):
	    """Return the author list, wrapping authors after the sixth in a toggle span."""
	    authors = authorString.strip().split(',')
	    if len(authors) <= 6:
	        return authorString
	    # cases with more than 6 authors => add the more/less toggle functionality
	    shown = ','.join(authors[:6])
	    hidden = ','.join(['<span class="toggleme">'] + authors[6:])
	    return shown + hidden + '</span>'

	def _formatPublication(item, ext_id):
	    """Build the HTML fragment (authors, linked title, journal data) for one result."""
	    authors = _formatAuthors(_elementText(item, 'authorString'))
	    title = _elementText(item, 'title')
	    journal = _elementText(item, 'journalTitle')
	    pubYear = _elementText(item, 'pubYear')
	    # The optional parts carry their own separator, so a missing value
	    # leaves no stray punctuation behind.
	    volume = _elementText(item, 'journalVolume')
	    if volume:
	        volume = ';' + volume
	    issue = _elementText(item, 'issue')
	    if issue:
	        issue = '(' + issue + ')'
	    pages = _elementText(item, 'pageInfo')
	    if pages:
	        pages = ':' + pages
	    doi = _elementText(item, 'doi')
	    if doi:
	        link = ' <a href="https://doi.org/' + doi + '" target="_blank">'
	    else:
	        # no DOI: link to a Europe PMC search for the ID instead
	        logger.warning('DOI problem before: %s - %s - %s', doi, '', ext_id)
	        link = ' <a href="http://europepmc.org/search?query=' + ext_id + '" target="_blank">'
	        logger.warning('DOI problem after: %s - %s - %s', ext_id, link, ext_id)
	    return authors + link + title + "</a>" + " <em>" + journal + "</em>" + " " + pubYear + volume + issue + pages

	def writeHtml():
	    """Write the sorted HTML publication list for the stored IDs.

	    Reads the IDs from ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``,
	    queries Europe PMC once per ID (``EXT_ID:<id>``) and formats every result
	    from these fields: authorString, title, journalTitle, pubYear,
	    journalVolume, issue, pageInfo and doi. Missing or empty fields are
	    rendered as empty strings. The entries are sorted with a PyICU collator
	    for the en_GB locale and written, as an ordered HTML list, to
	    ``<resultDirName>/<resultFilePrefix>all-SIB-publications.txt``.

	    Uses the module-level names ``resultDirName``, ``resultFilePrefix``,
	    ``month`` and ``logger``.

	    Exits with status 1 when a result carries an id different from the one
	    that was requested.
	    """
	    fallIds = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
	    with open(fallIds) as fh:
	        stripped = [line.strip() for line in fh]
	    # blank lines would otherwise produce a query with an empty EXT_ID
	    extIds = [entry for entry in stripped if entry]

	    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
	    listHtml = []
	    # query Europe PMC with each id and store the required fields
	    for ext_id in extIds:
	        # close each HTTP response once its XML has been parsed
	        with urlopen(query + ext_id) as response:
	            doc = parse(response)
	        # resultList/result are xml tags
	        for item in doc.findall("resultList/result"):
	            received = _elementText(item, 'id')
	            if received != ext_id:
	                print('sent: ' + ext_id + ' received: ' + received)
	                sys.exit(1)
	            logger.info("ok: %s", ext_id)
	            listHtml.append(_formatPublication(item, ext_id))

	    # sort the list using locale-aware (ICU) collation
	    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
	    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))

	    # store publications html info into file; written as UTF-8 so that
	    # non-ASCII author names do not depend on the platform default encoding
	    fhtmlResult = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
	    with open(fhtmlResult, 'w', encoding='utf-8') as fout:
	        print("<h1>Publications by SIB Members published in ",month," </h1>", file=fout)
	        print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: " + datetime.datetime.now().strftime('%Y-%m-%d') + "</i>. </h6>", file=fout)
	        print("<h2>Peer-reviewed articles and conference proceedings</h2>", file=fout)
	        print("<ol id=\"sibpublis\">", file=fout)
	        for line in listHtmlSorted:
	            print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
	        print("</ol>", file=fout)

	#
	# Main program
	#
	def previousMonth(today):
	    """Return (year, month) of the calendar month before the date *today*.

	    January rolls back to December of the previous year.
	    """
	    if today.month == 1:
	        return today.year - 1, 12
	    return today.year, today.month - 1


	if __name__ == "__main__":
	    # Module-level state shared with the functions above.
	    ids = list()
	    idSet = set()
	    # Run from the script's own directory so the results directory is created there.
	    os.chdir(os.path.dirname(os.path.realpath(__file__)))
	    otherPub = None
	    FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
	    # logging.basicConfig(format=FORMAT)
	    logger = logging.getLogger('main')
	    logger.setLevel(logging.DEBUG)
	    current_date = datetime.date.today()
	    today = str(current_date)
	    today_time_stamp = today.replace('-','')

	    # argv[1] (year) and argv[2] (month) are optional; both must be given together
	    if len(sys.argv) >= 3:
	        try:
	            year = int(sys.argv[1])
	        except ValueError:
	            year = None
	        # the upper bound follows the current year instead of a fixed 2020
	        if year is None or year < 1998 or year > current_date.year:
	            print("year must be an integer between 1998 and " + str(current_date.year))
	            print(sys.argv[0], sys.argv[1], sys.argv[2])
	            sys.exit(1)
	        try:
	            month = int(sys.argv[2])
	        except ValueError:
	            month = None
	        if month is None or month < 1 or month > 12:
	            print('month must be an integer between 1 and 12')
	            # stop here: continuing would build a query with an invalid date range
	            sys.exit(1)
	    else:
	        # previous month by default (December of last year when run in January)
	        year, month = previousMonth(current_date)
	    month_nb_days = calendar.monthrange(year, month)[1]

	    date_filename = datetime.datetime.strptime(today, "%Y-%m-%d")
	    resultDirName = 'results_' + str(year) + '-' + str(month).zfill(2) + '_' + today_time_stamp
	    resultFilePrefix = str(year) + '-' + str(month).zfill(2) + '_' + today_time_stamp + '-'
	    os.makedirs(resultDirName, exist_ok=True)
	    print ('Requested year:',year, ',month: ',month, '(',month_nb_days,'day-s). Timestamp: ', today_time_stamp)
	    getPublicationsWithSIBAffilliation(year,month,month_nb_days)
	    readIDs()
	    writeHtml()
	    sys.exit(0)

SIB-monthly-publications.pyNo OneTemporaryActions

File Metadata

SIB-monthly-publications.pyView Options

Event Timeline

SIB-monthly-publications.py
No OneTemporary
Actions

SIB-monthly-publications.py
View Options