SIB-GL-publications.py

#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
# -*- coding: utf-8 -*-
"""
Retrieves SIB publications from EuropePMC.org for a given year and
cross-checks them against SIB group leaders and their secondary affiliations.
Created on Thu Dec 22 09:08:41 2016
@author: vioannid, hstockin
"""
### to read character encoded files
import codecs
import sys
import os
import csv
from pprint import pprint
import datetime
### import xml.etree.ElementTree as ET
from urllib.request import urlopen
from xml.etree.ElementTree import parse
### import regular expressions
import re
### used for UTF-8 sorting
import PyICU
from functools import cmp_to_key
### error handling
import logging
import traceback
### HTML validation using HTML tidy
from tidylib import tidy_document, tidy_fragment
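# NB: tidylib here is the pytidylib wrapper around the HTML Tidy C library
# (libtidy), which must be installed on the system separately from the
# Python package.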
# Retrieve the 'Other Articles' section from sib.swiss/scientific-publications
# and save it to a variable to be used in writeHtml()
# def getOtherPublications():
#     url = 'http://www.sib.swiss/research/scientific-publications'
#     sock = urlopen(url).read().decode('utf-8')
#     pat = re.compile(r"((<!--OTHER-->.*<!--END-OTHER-->))", re.M | re.S)
#     global otherPub
#
#     try:
#         otherPub = pat.search(sock)
#     except:
#         print("Problem with the _Other articles_ section")
#         sys.exit(1)
# Retrieve all publications where SIB appears among the author affiliations
# @param year of publication
def getPublicationsWithSIBAffiliation(year):
    print("SIB publications:", year)
    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
    date = 'FIRST_PDATE:[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
    #date = 'FIRST_PDATE:[' + str(year) + '-05-01%20TO%20' + str(year) + '-05-31]'
    page = '&cursorMark=*&pageSize=1000'  # pagination default is limited to 25 entries
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + sib + date + page
    u = urlopen(query)
    doc = parse(u)
    # resultList/result are xml tags
    lst = doc.findall("resultList/result")
    # hitCount is an xml tag
    entries = doc.findall("hitCount")
    print('Number of entries found = ' + entries[0].text)
    if int(entries[0].text) > 1000:
        print('Error: more than 1000 entries found - need to extend query')
        sys.exit(1)
    # store the publication (result) 'id's into a file
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fout = open(fresult, 'w+')
    for item in lst:
        ids.append(item.find('id').text)
        print(item.find('id').text, file=fout)
    fout.close()
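# The 1000-entry ceiling above could be lifted with Europe PMC's cursor-based
# pagination instead of aborting. A minimal commented-out sketch, in the same
# spirit as getOtherPublications() above: it assumes the response carries a
# top-level <nextCursorMark> element (per the Europe PMC REST documentation);
# fetchAllResults is a hypothetical helper, not used by this script.
# def fetchAllResults(baseQuery):
#     from urllib.parse import quote
#     cursor = '*'
#     results = []
#     while True:
#         u = urlopen(baseQuery + '&pageSize=1000&cursorMark=' + quote(cursor))
#         doc = parse(u)
#         results.extend(doc.findall("resultList/result"))
#         nxt = doc.find("nextCursorMark")
#         # the service repeats (or omits) the cursor on the last page
#         if nxt is None or nxt.text == cursor:
#             break
#         cursor = nxt.text
#     return results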
# Read IDs from the file written above and append them to the ids list
def readIDs():
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fh = open(fresult)
    for line in fh:
        ids.append(line.strip())
    fh.close()
# Write the final set of all IDs (stored in idSet)
def writeAllIDs():
    resultFile = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fout = open(resultFile, 'w+')
    for item in idSet:
        print(item, file=fout)
    fout.close()
    print('All IDs are written into the following file: ', resultFile)
# Retrieve publications for each of the GLs using their home university
# @param year of publication
def getPublicationsForGroupLeaders(year):
    # import the list of GLs and affiliations from file
    fname = 'conf/list-of-GLs.csv'
    #fname = 'conf/list-of-GLs-test-3.csv'
    fresult = resultDirName + '/' + resultFilePrefix + 'publications-for-GLs.csv'
    fout = open(fresult, 'w+')
    wr = csv.writer(fout, quoting=csv.QUOTE_ALL)
    # adapt the encoding according to the input file
    fh = codecs.open(fname, encoding='ISO-8859-1')
    global idSet
    idSet = set(ids)
    print("Name , SIB, Uni , Paper IDs", file=fout)
    # for each GL, find publications with and without SIB affiliation
    for lines in fh:
        line = lines.strip().split(',')
        # exclude special rows (comments and header lines)
        if line[0].startswith('#') or line[0].startswith('SIB') or line[0].startswith('"(AUTH'):
            continue
        auth = '%22' + line[0].replace(" ", "%20") + '%22'
        aff1 = '%22Swiss%20Institute%20of%20Bioinformatics%22'
        aff2 = '%22' + line[1].replace(" ", "%20") + '%22'
        fpdate = '[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
        # download the XML and parse it
        prefix = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query='
        # The following lines _do not work_!
        #url1 = prefix + 'AUTH:' + auth + ' AFF:' + aff1 + ' FIRST_PDATE:' + fpdate
        #url1 = prefix + 'auth:' + auth + ' aff:' + aff1 + ' first_pdate:' + fpdate
        #url1 = prefix + 'auth:' + auth + 'aff:' + aff1 + 'first_pdate:' + fpdate
        url1 = prefix + 'AUTH:' + auth + '%20AFF:' + aff1 + '%20FIRST_PDATE:' + fpdate
        url2 = prefix + 'AUTH:' + auth + '%20AFF:' + aff2 + '%20FIRST_PDATE:' + fpdate
        u1 = urlopen(url1)
        u2 = urlopen(url2)
        #print(url1)
        #print(url2)
        # example working url below
        #u = urlopen('http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=AUTH:"Stockinger%20H"AFF:"swiss%20institute%20of%20bioinformatics"FIRST_PDATE:[2016-01-01%20TO%202016-12-31]')
        doc = parse(u1)
        vava = doc.findall("request")
        #for truc in vava:
        #    print(truc.find('query').text)
        lst1 = doc.findall("resultList/result")
        ids1 = list()
        for item in lst1:
            ids1.append(item.find('id').text)
        doc2 = parse(u2)
        lst2 = doc2.findall("resultList/result")
        ids2 = list()
        for item in lst2:
            ids2.append(item.find('id').text)
        #print(line[0])
        #print('counts:', len(ids1), '//ids1:', ids1)
        #print('counts:', len(ids2), '//ids2:', ids2)
        un = set(ids1)
        deux = set(ids2)
        #print('counts union:', len(un | deux), '-> union:', un | deux)
        #print('counts intersection:', len(un & deux), '-> intersection:', un & deux)
        #print('counts only local:', len(deux - un), '-> only local:', deux - un)
        #print('In ids2 but not in ids:', deux.difference(idSet))
        #print('Len(idSet)', len(idSet))
        idSet = idSet.union(deux)
        print('Len(idSet)', len(idSet))
        # exit(2)
        mylist = list()
        mylist.append(line[0])
        mylist.append(len(ids1))
        mylist.append(len(ids2))
        mylist.extend(list(un | deux))
        wr.writerow(mylist)
    fout.close()
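# The manual '%22'/'%20' escaping above is fragile (names with commas,
# apostrophes or other reserved characters would break the query). A minimal
# alternative sketch using the standard library; buildGLQuery is a
# hypothetical helper, not used by this script:
# from urllib.parse import quote
# def buildGLQuery(prefix, author, affiliation, year):
#     q = 'AUTH:"%s" AFF:"%s" FIRST_PDATE:[%d-01-01 TO %d-12-31]' % (
#         author, affiliation, year, year)
#     # keep the field separators and range brackets unescaped
#     return prefix + quote(q, safe=':[]')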
# Validate the generated HTML fragment with HTML Tidy
def validateHtml():
    htmlFile = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    htmlHandle = open(htmlFile, "r")
    htmlString = htmlHandle.read()
    htmlHandle.close()
    tidyoptions = {
        "indent": "auto",
        "indent-spaces": 2,
        "wrap": 72,
        "markup": True,
        "output-xml": False,
        "input-xml": False,
        "show-warnings": True,
        "numeric-entities": True,
        "quote-marks": True,
        "quote-nbsp": True,
        "quote-ampersand": False,
        "break-before-br": False,
        "uppercase-tags": False,
        "uppercase-attributes": False
    }
    htmlString, errors = tidy_fragment(htmlString, tidyoptions)
    if errors:
        return False
    return True
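# When validateHtml() returns False, tidy's error report (discarded above)
# pinpoints the offending markup; a quick interactive check on the generated
# file, reusing the same tidy_fragment call (path is a placeholder):
# _, errors = tidy_fragment(open('path/to/all-SIB-publications.txt').read(), {"show-warnings": True})
# print(errors)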
# Retrieve all publications with the following fields:
# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
# Import the list of IDs from results/all-IDs.txt
def writeHtml():
    #global year
    fallIds = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fh = open(fallIds)
    extIds = list()
    for line in fh:
        extIds.append(line.strip())
    fh.close()
    # print(extIds, len(extIds), len(sorted(set(extIds))))
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
    listHtml = []
    # query Europe PMC with each id and store the required fields
    for ext_id in extIds:
        u = urlopen(query + ext_id)
        doc = parse(u)
        # resultList/result are xml tags
        lst = doc.findall("resultList/result")
        for item in lst:
            rA = ''
            rT = ''
            rJT = ''
            rJV = ''
            rPY = ''
            rIssue = ''
            rPI = ''
            rDOIstr = ''
            rDOI = ''
            try:
                if item.find('id').text == ext_id:
                    logger.info("ok: %s", ext_id)
            except:
                print('sent: ' + ext_id + ' received: ' + item.find('id').text)
                sys.exit(1)
            try:
                # cases with more than 6 authors => add the more/less toggle functionality
                authStr = item.find('authorString').text
                auths = authStr.strip().split(',')
                if len(auths) > 6:
                    auths_pre = ','.join(auths[0:6])
                    auths_post = ','.join(['<span class="toggleme">'] + auths[6:])
                    authsList = [auths_pre, auths_post]
                    authsList.append('</span>')
                    rA = ''.join(authsList)
                else:
                    rA = item.find('authorString').text
            except:
                rA = ''
            try:
                rT = item.find('title').text
            except:
                rT = ''
            try:
                rJT = item.find('journalTitle').text
            except:
                rJT = ''
            try:
                rPY = item.find('pubYear').text
            except:
                rPY = ''
            try:
                rJV = item.find('journalVolume').text
                if rJV is not None:
                    rJV = ';' + rJV
                else:
                    rJV = ''
            except:
                rJV = ''
            try:
                rIssue = item.find('issue').text
                if rIssue is not None:
                    rIssue = '(' + rIssue + ')'
                else:
                    rIssue = ''
            except:
                rIssue = ''
            try:
                rPI = item.find('pageInfo').text
                if rPI is not None:
                    rPI = ':' + rPI
                else:
                    rPI = ''
            except:
                rPI = ''
            try:
                rDOIstr = item.find('doi').text
                rDOI = ' <a href="https://doi.org/' + rDOIstr + '" target="_blank">'
                # logger.warning('DOI ok: %s | %s | %s', rDOIstr, rDOI, ext_id)
            except Exception:
                # print("DOI not found, using pmid: " + ext_id)
                logger.warning('DOI problem before: %s - %s - %s', rDOIstr, rDOI, ext_id)
                rDOIstr = ext_id
                rDOI = ' <a href="http://europepmc.org/search?query=' + rDOIstr + '" target="_blank">'
                logger.warning('DOI problem after: %s - %s - %s', rDOIstr, rDOI, ext_id)
            listHtml.append(rA + rDOI + rT + "</a>" + " <em>" + rJT + "</em>" + " " + rPY + rJV + rIssue + rPI)
    # sort a UTF-8 list case-insensitively
    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))
    # store the publications' html info into a file
    fhtmlResult = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    fout = open(fhtmlResult, 'w+')
    print("<h1>Publications by SIB Members published in ", year, " </h1>", file=fout)
    from datetime import datetime  # local import; the module-level 'datetime' module stays usable elsewhere
    print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: " + datetime.now().strftime('%Y-%m-%d') + "</i>. </h6>", file=fout)
    print("<h2>Peer-reviewed articles and conference proceedings</h2>", file=fout)
    print('<p>Search <input type="text"></p>', file=fout)
    print("<ol id=\"sibpublis\">", file=fout)
    for line in listHtmlSorted:
        print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
    print("</ol>", file=fout)
    # add the 'other publications' section
    #print(str(otherPub[0]), file=fout)
    fout.close()
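# Why PyICU rather than plain sorted(): bytewise ordering sorts all uppercase
# ASCII before lowercase and pushes accented names to the end, while the ICU
# collator interleaves entries case-insensitively. A small illustration
# (standard ICU behaviour):
# collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
# sorted(['Zebra', 'apple'])                                   # -> ['Zebra', 'apple']
# sorted(['Zebra', 'apple'], key=cmp_to_key(collator.compare)) # -> ['apple', 'Zebra']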
#
# Main program
#
if __name__ == "__main__":
    ids = list()
    idSet = set()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    otherPub = None
    today = str(datetime.date.today()).replace('-', '')
    FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
    # logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    # argv[1] is an optional parameter for the year
    if len(sys.argv) >= 2:
        try:
            year = int(sys.argv[1])
            print(year)
            if year < 1998 or year > 2020:
                raise ValueError
        except ValueError:
            print("Year must be an integer between 1998 and 2020")
            print(sys.argv[0], sys.argv[1])
            sys.exit(1)
    else:
        year = int(today[:4])
    resultDirName = 'results-' + str(year) + '_' + today
    resultFilePrefix = str(year) + '_' + today + '-'
    if not os.path.exists(resultDirName):
        os.mkdir(resultDirName)
    #getOtherPublications()
    getPublicationsWithSIBAffiliation(year)
    readIDs()
    getPublicationsForGroupLeaders(year)
    print('\nNumber of publications obtained:', len(idSet))
    writeAllIDs()
    writeHtml()
    if validateHtml() is False:
        print("Malformed HTML file")
        sys.exit(1)
    else:
        print("Valid HTML")
        sys.exit(0)
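# Typical invocation (results land in a dated 'results-<year>_<YYYYMMDD>'
# directory next to the script):
#   ./SIB-GL-publications.py 2016
# or, defaulting to the current year:
#   ./SIB-GL-publications.py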
