SIB-GL-publications.py

#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
# -*- coding: utf-8 -*-
"""
Retrieves SIB publications from EuropePMC.org for a given year and
cross-checks them against SIB group leaders and their secondary affiliations.
Created on Thu Dec 22 09:08:41 2016
@author: vioannid, hstockin
"""
### to read character encoded files
import codecs
import sys
import os
import csv
from pprint import pprint
import datetime
### import xml.etree.ElementTree as ET
from urllib.request import urlopen
from xml.etree.ElementTree import parse
### import regular expressions
import re
### used for UTF-8 sorting
import PyICU
from functools import cmp_to_key
### error handling
import logging
import traceback
### HTML validation using HTML tidy
from tidylib import tidy_document, tidy_fragment
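# NB: tidylib here is the pytidylib wrapper around the HTML Tidy C library
# (libtidy), which must be installed on the system separately from the
# Python package.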
# Retrieve the 'Other Articles' section from sib.swiss/scientific-publications
# and save it to a variable to be used in writeHtml()
# def getOtherPublications():
#     url = 'http://www.sib.swiss/research/scientific-publications'
#     sock = urlopen(url).read().decode('utf-8')
#     pat = re.compile(r"((<!--OTHER-->.*<!--END-OTHER-->))", re.M | re.S)
#     global otherPub
#
#     try:
#         otherPub = pat.search(sock)
#     except:
#         print("Problem with the _Other articles_ section")
#         sys.exit(1)
# Retrieve all publications where SIB appears among the author affiliations
# @param year of publication
def getPublicationsWithSIBAffiliation(year):
    print("SIB publications:", year)
    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
    date = 'FIRST_PDATE:[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
    #date = 'FIRST_PDATE:[' + str(year) + '-05-01%20TO%20' + str(year) + '-05-31]'
    page = '&cursorMark=*&pageSize=1000'  # pagination default is limited to 25 entries
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + sib + date + page
    u = urlopen(query)
    doc = parse(u)
    # resultList/result are xml tags
    lst = doc.findall("resultList/result")
    # hitCount is an xml tag
    entries = doc.findall("hitCount")
    print('Number of entries found = ' + entries[0].text)
    if int(entries[0].text) > 1000:
        print('Error: more than 1000 entries found - need to extend query')
        sys.exit(1)
    # store the publication (result) 'id's into a file
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fout = open(fresult, 'w+')
    for item in lst:
        ids.append(item.find('id').text)
        print(item.find('id').text, file=fout)
    fout.close()
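# The 1000-entry ceiling above could be lifted with Europe PMC's cursor-based
# pagination instead of aborting. A minimal commented-out sketch, in the same
# spirit as getOtherPublications() above: it assumes the response carries a
# top-level <nextCursorMark> element (per the Europe PMC REST documentation);
# fetchAllResults is a hypothetical helper, not used by this script.
# def fetchAllResults(baseQuery):
#     from urllib.parse import quote
#     cursor = '*'
#     results = []
#     while True:
#         u = urlopen(baseQuery + '&pageSize=1000&cursorMark=' + quote(cursor))
#         doc = parse(u)
#         results.extend(doc.findall("resultList/result"))
#         nxt = doc.find("nextCursorMark")
#         # the service repeats (or omits) the cursor on the last page
#         if nxt is None or nxt.text == cursor:
#             break
#         cursor = nxt.text
#     return results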
# Read IDs from the file written above and append them to the ids list
def readIDs():
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fh = open(fresult)
    for line in fh:
        ids.append(line.strip())
    fh.close()
# Write the final set of all IDs (stored in idSet)
def writeAllIDs():
    resultFile = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fout = open(resultFile, 'w+')
    for item in idSet:
        print(item, file=fout)
    fout.close()
    print('All IDs are written into the following file: ', resultFile)
# Retrieve publications for each of the GLs using their home university
# @param year of publication
def getPublicationsForGroupLeaders(year):
    # import the list of GLs and affiliations from file
    fname = 'conf/list-of-GLs.csv'
    #fname = 'conf/list-of-GLs-test-3.csv'
    fresult = resultDirName + '/' + resultFilePrefix + 'publications-for-GLs.csv'
    fout = open(fresult, 'w+')
    wr = csv.writer(fout, quoting=csv.QUOTE_ALL)
    # adapt the encoding according to the input file
    fh = codecs.open(fname, encoding='ISO-8859-1')
    global idSet
    idSet = set(ids)
    print("Name , SIB, Uni , Paper IDs", file=fout)
    # for each GL, find publications with and without SIB affiliation
    for lines in fh:
        line = lines.strip().split(',')
        # exclude special rows (comments and header lines)
        if line[0].startswith('#') or line[0].startswith('SIB') or line[0].startswith('"(AUTH'):
            continue
        auth = '%22' + line[0].replace(" ", "%20") + '%22'
        aff1 = '%22Swiss%20Institute%20of%20Bioinformatics%22'
        aff2 = '%22' + line[1].replace(" ", "%20") + '%22'
        fpdate = '[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
        # download the XML and parse it
        prefix = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query='
        # The following lines _do not work_!
        #url1 = prefix + 'AUTH:' + auth + ' AFF:' + aff1 + ' FIRST_PDATE:' + fpdate
        #url1 = prefix + 'auth:' + auth + ' aff:' + aff1 + ' first_pdate:' + fpdate
        #url1 = prefix + 'auth:' + auth + 'aff:' + aff1 + 'first_pdate:' + fpdate
        url1 = prefix + 'AUTH:' + auth + '%20AFF:' + aff1 + '%20FIRST_PDATE:' + fpdate
        url2 = prefix + 'AUTH:' + auth + '%20AFF:' + aff2 + '%20FIRST_PDATE:' + fpdate
        u1 = urlopen(url1)
        u2 = urlopen(url2)
        #print(url1)
        #print(url2)
        # example working url below
        #u = urlopen('http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=AUTH:"Stockinger%20H"AFF:"swiss%20institute%20of%20bioinformatics"FIRST_PDATE:[2016-01-01%20TO%202016-12-31]')
        doc = parse(u1)
        vava = doc.findall("request")
        #for truc in vava:
        #    print(truc.find('query').text)
        lst1 = doc.findall("resultList/result")
        ids1 = list()
        for item in lst1:
            ids1.append(item.find('id').text)
        doc2 = parse(u2)
        lst2 = doc2.findall("resultList/result")
        ids2 = list()
        for item in lst2:
            ids2.append(item.find('id').text)
        #print(line[0])
        #print('counts:', len(ids1), '//ids1:', ids1)
        #print('counts:', len(ids2), '//ids2:', ids2)
        un = set(ids1)
        deux = set(ids2)
        #print('counts union:', len(un | deux), '-> union:', un | deux)
        #print('counts intersection:', len(un & deux), '-> intersection:', un & deux)
        #print('counts only local:', len(deux - un), '-> only local:', deux - un)
        #print('In ids2 but not in ids:', deux.difference(idSet))
        #print('Len(idSet)', len(idSet))
        idSet = idSet.union(deux)
        print('Len(idSet)', len(idSet))
        # exit(2)
        mylist = list()
        mylist.append(line[0])
        mylist.append(len(ids1))
        mylist.append(len(ids2))
        mylist.extend(list(un | deux))
        wr.writerow(mylist)
    fout.close()
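# The manual '%22'/'%20' escaping above is fragile (names with commas,
# apostrophes or other reserved characters would break the query). A minimal
# alternative sketch using the standard library; buildGLQuery is a
# hypothetical helper, not used by this script:
# from urllib.parse import quote
# def buildGLQuery(prefix, author, affiliation, year):
#     q = 'AUTH:"%s" AFF:"%s" FIRST_PDATE:[%d-01-01 TO %d-12-31]' % (
#         author, affiliation, year, year)
#     # keep the field separators and range brackets unescaped
#     return prefix + quote(q, safe=':[]')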
# Validate the generated HTML fragment with HTML Tidy
def validateHtml():
    htmlFile = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    htmlHandle = open(htmlFile, "r")
    htmlString = htmlHandle.read()
    htmlHandle.close()
    tidyoptions = {
        "indent": "auto",
        "indent-spaces": 2,
        "wrap": 72,
        "markup": True,
        "output-xml": False,
        "input-xml": False,
        "show-warnings": True,
        "numeric-entities": True,
        "quote-marks": True,
        "quote-nbsp": True,
        "quote-ampersand": False,
        "break-before-br": False,
        "uppercase-tags": False,
        "uppercase-attributes": False
    }
    htmlString, errors = tidy_fragment(htmlString, tidyoptions)
    if errors:
        return False
    return True
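# When validateHtml() returns False, tidy's error report (discarded above)
# pinpoints the offending markup; a quick interactive check on the generated
# file, reusing the same tidy_fragment call (path is a placeholder):
# _, errors = tidy_fragment(open('path/to/all-SIB-publications.txt').read(), {"show-warnings": True})
# print(errors)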
# Retrieve all publications with the following fields:
# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
# Import the list of IDs from results/all-IDs.txt
def writeHtml():
    #global year
    fallIds = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fh = open(fallIds)
    extIds = list()
    for line in fh:
        extIds.append(line.strip())
    fh.close()
    # print(extIds, len(extIds), len(sorted(set(extIds))))
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
    listHtml = []
    # query Europe PMC with each id and store the required fields
    for ext_id in extIds:
        u = urlopen(query + ext_id)
        doc = parse(u)
        # resultList/result are xml tags
        lst = doc.findall("resultList/result")
        for item in lst:
            rA = ''
            rT = ''
            rJT = ''
            rJV = ''
            rPY = ''
            rIssue = ''
            rPI = ''
            rDOIstr = ''
            rDOI = ''
            try:
                if item.find('id').text == ext_id:
                    logger.info("ok: %s", ext_id)
            except:
                print('sent: ' + ext_id + ' received: ' + item.find('id').text)
                sys.exit(1)
            try:
                # cases with more than 6 authors => add the more/less toggle functionality
                authStr = item.find('authorString').text
                auths = authStr.strip().split(',')
                if len(auths) > 6:
                    auths_pre = ','.join(auths[0:6])
                    auths_post = ','.join(['<span class="toggleme">'] + auths[6:])
                    authsList = [auths_pre, auths_post]
                    authsList.append('</span>')
                    rA = ''.join(authsList)
                else:
                    rA = item.find('authorString').text
            except:
                rA = ''
            try:
                rT = item.find('title').text
            except:
                rT = ''
            try:
                rJT = item.find('journalTitle').text
            except:
                rJT = ''
            try:
                rPY = item.find('pubYear').text
            except:
                rPY = ''
            try:
                rJV = item.find('journalVolume').text
                if rJV is not None:
                    rJV = ';' + rJV
                else:
                    rJV = ''
            except:
                rJV = ''
            try:
                rIssue = item.find('issue').text
                if rIssue is not None:
                    rIssue = '(' + rIssue + ')'
                else:
                    rIssue = ''
            except:
                rIssue = ''
            try:
                rPI = item.find('pageInfo').text
                if rPI is not None:
                    rPI = ':' + rPI
                else:
                    rPI = ''
            except:
                rPI = ''
            try:
                rDOIstr = item.find('doi').text
                rDOI = ' <a href="https://doi.org/' + rDOIstr + '" target="_blank">'
                # logger.warning('DOI ok: %s | %s | %s', rDOIstr, rDOI, ext_id)
            except Exception:
                # print("DOI not found, using pmid: " + ext_id)
                logger.warning('DOI problem before: %s - %s - %s', rDOIstr, rDOI, ext_id)
                rDOIstr = ext_id
                rDOI = ' <a href="http://europepmc.org/search?query=' + rDOIstr + '" target="_blank">'
                logger.warning('DOI problem after: %s - %s - %s', rDOIstr, rDOI, ext_id)
            listHtml.append(rA + rDOI + rT + "</a>" + " <em>" + rJT + "</em>" + " " + rPY + rJV + rIssue + rPI)
    # sort a UTF-8 list case-insensitively
    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))
    # store the publications' html info into a file
    fhtmlResult = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    fout = open(fhtmlResult, 'w+')
    print("<h1>Publications by SIB Members published in ", year, " </h1>", file=fout)
    from datetime import datetime  # local import; the module-level 'datetime' module stays usable elsewhere
    print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: " + datetime.now().strftime('%Y-%m-%d') + "</i>. </h6>", file=fout)
    print("<h2>Peer-reviewed articles and conference proceedings</h2>", file=fout)
    print('<p>Search <input type="text"></p>', file=fout)
    print("<ol id=\"sibpublis\">", file=fout)
    for line in listHtmlSorted:
        print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
    print("</ol>", file=fout)
    # add the 'other publications' section
    #print(str(otherPub[0]), file=fout)
    fout.close()
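# Why PyICU rather than plain sorted(): bytewise ordering sorts all uppercase
# ASCII before lowercase and pushes accented names to the end, while the ICU
# collator interleaves entries case-insensitively. A small illustration
# (standard ICU behaviour):
# collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
# sorted(['Zebra', 'apple'])                                   # -> ['Zebra', 'apple']
# sorted(['Zebra', 'apple'], key=cmp_to_key(collator.compare)) # -> ['apple', 'Zebra']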
#
# Main program
#
if __name__ == "__main__":
    ids = list()
    idSet = set()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    otherPub = None
    today = str(datetime.date.today()).replace('-', '')
    FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
    # logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    # argv[1] is an optional parameter for the year
    if len(sys.argv) >= 2:
        try:
            year = int(sys.argv[1])
            print(year)
            if year < 1998 or year > 2020:
                raise ValueError
        except ValueError:
            print("Year must be an integer between 1998 and 2020")
            print(sys.argv[0], sys.argv[1])
            sys.exit(1)
    else:
        year = int(today[:4])
    resultDirName = 'results-' + str(year) + '_' + today
    resultFilePrefix = str(year) + '_' + today + '-'
    if not os.path.exists(resultDirName):
        os.mkdir(resultDirName)
    #getOtherPublications()
    getPublicationsWithSIBAffiliation(year)
    readIDs()
    getPublicationsForGroupLeaders(year)
    print('\nNumber of publications obtained:', len(idSet))
    writeAllIDs()
    writeHtml()
    if validateHtml() is False:
        print("Malformed HTML file")
        sys.exit(1)
    else:
        print("Valid HTML")
        sys.exit(0)
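# Typical invocation (results land in a dated 'results-<year>_<YYYYMMDD>'
# directory next to the script):
#   ./SIB-GL-publications.py 2016
# or, defaulting to the current year:
#   ./SIB-GL-publications.py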
