Page MenuHomec4science

SIB-monthly-publications.py
No OneTemporary

File Metadata

Created
Sat, Jan 25, 23:28

SIB-monthly-publications.py

#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
# -*- coding: utf-8 -*-
"""
Program to retrieve SIB publications from EuropePMC.org for a given month
Created on Mon Jun 26 11:22:00 2017
@author: vioannid, hstockin
"""
### to read character encoded files
import codecs
import sys
import os
import csv
from pprint import pprint
import datetime
import calendar
### import xml.etree.ElementTree as ET
from urllib.request import urlopen
from xml.etree.ElementTree import parse
### import regular expressions
import re
### used for UTF-8 sorting
import PyICU
from functools import cmp_to_key
### error handling
import logging
import traceback
# Retrieve all publications where SIB is mentioned as affiliation in the author list
# @param year,month,month_nb_days of publication
def getPublicationsWithSIBAffilliation(year,month,month_nb_days):
    """Collect the ids of one month of SIB-affiliated publications.

    Sends a single search request to the Europe PMC REST service, restricted
    to the affiliation "Swiss Institute of Bioinformatics" and to a
    FIRST_PDATE range running from the first day to day ``month_nb_days`` of
    the given month. Every returned id is appended to the module-level list
    ``ids`` and written, one per line, to
    ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``.

    The module-level names ``ids``, ``resultDirName`` and ``resultFilePrefix``
    must exist before the call (the main script section defines them).

    @param year: publication year
    @param month: publication month number (zero-padded to two digits here)
    @param month_nb_days: last day of the month, used as end of the date range

    Exits the program with status 1 when the response carries no hit count or
    reports more than 1000 hits, because only one page of 1000 is requested.
    """
    print("SIB publications:", year, month, month_nb_days)
    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
    year_month = str(year) + '-' + str(month).zfill(2)
    date = ('FIRST_PDATE:[' + year_month + '-01%20TO%20'
            + year_month + '-' + str(month_nb_days) + ']')
    page = '&cursorMark=*&pageSize=1000' # pagination default is limited to 25 entries
    # NOTE(review): the affiliation and date clauses are concatenated without
    # any separator; kept as is - confirm the service parses this as intended.
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + sib + date + page
    print(query)
    # Close the HTTP response as soon as the XML has been parsed.
    with urlopen(query) as response:
        doc = parse(response)
    # resultList/result are xml tags
    results = doc.findall("resultList/result")
    # hitCount is an xml tag
    hit_count = doc.findtext("hitCount")
    if hit_count is None:
        print('Error: no hitCount element in the response')
        sys.exit(1)
    print('Number of entries found = ' + hit_count)
    if int(hit_count) > 1000:
        print('Error: more than 1000 entries found - need to extend query')
        sys.exit(1)
    # store publications (result) 'id' into file
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    with open(fresult, 'w+') as fout:
        for item in results:
            result_id = item.find('id').text
            ids.append(result_id)
            print(result_id, file=fout)
# Read IDs from a file and assign to ids()
def readIDs():
    """Append the ids stored in the all-SIB-IDs file to the module-level ``ids``.

    Reads ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt`` line by line,
    strips surrounding whitespace and appends every non-empty value to
    ``ids``. ``ids``, ``resultDirName`` and ``resultFilePrefix`` are
    module-level names that must exist before the call.
    """
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    # 'with' guarantees the handle is released even if reading fails.
    with open(fresult) as fh:
        for line in fh:
            entry = line.strip()
            # A blank line is not an id; do not store an empty string.
            if entry:
                ids.append(entry)
# Retrieve all publications with the following fields:
# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
# import list of IDs from results/all-IDs.txt
def _element_text(item, tag):
    """Return the text of child *tag* of *item*, or '' when absent or empty."""
    return item.findtext(tag) or ''


def _format_authors(author_string):
    """Return the author list, wrapping authors after the sixth in a toggle span."""
    authors = author_string.strip().split(',')
    if len(authors) <= 6:
        return author_string
    # cases with more than 6 authors => add the more/less toggle functionality
    visible = ','.join(authors[:6])
    hidden = ','.join(['<span class="toggleme">'] + authors[6:])
    return visible + hidden + '</span>'


def _format_publication(item, ext_id):
    """Build the HTML fragment describing one search result *item*."""
    authors = _format_authors(_element_text(item, 'authorString'))
    title = _element_text(item, 'title')
    journal = _element_text(item, 'journalTitle')
    pub_year = _element_text(item, 'pubYear')
    volume = _element_text(item, 'journalVolume')
    issue = _element_text(item, 'issue')
    pages = _element_text(item, 'pageInfo')
    # Volume, issue and pages only get their punctuation when present.
    volume = ';' + volume if volume else ''
    issue = '(' + issue + ')' if issue else ''
    pages = ':' + pages if pages else ''
    doi = _element_text(item, 'doi')
    link = ''
    if doi:
        link = ' <a href="https://doi.org/' + doi + '" target="_blank">'
    else:
        # No DOI: link to a Europe PMC search for the id instead.
        logger.warning('DOI problem before: %s - %s - %s', doi, link, ext_id)
        doi = ext_id
        link = ' <a href="http://europepmc.org/search?query=' + doi + '" target="_blank">'
        logger.warning('DOI problem after: %s - %s - %s', doi, link, ext_id)
    # NOTE(review): the fields are inserted into the HTML without escaping;
    # confirm whether the service's titles are expected to carry markup.
    return (authors + link + title + "</a>" + " <em>" + journal + "</em>"
            + " " + pub_year + volume + issue + pages)


def writeHtml():
    """Write the HTML publication list for the ids stored in the ids file.

    Reads the ids from ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``,
    queries the Europe PMC REST search service once per id (``EXT_ID:<id>``),
    formats every returned result as one HTML list entry, sorts the entries
    with a PyICU collator for locale ``en_GB.UTF-8`` and writes the page to
    ``<resultDirName>/<resultFilePrefix>all-SIB-publications.txt``.

    Uses the module-level names ``resultDirName``, ``resultFilePrefix``,
    ``month`` and ``logger``, which must exist before the call.

    Exits the program with status 1 when a result carries no ``id`` element.
    """
    fallIds = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    with open(fallIds) as fh:
        # Blank lines are skipped so that no empty EXT_ID query is sent.
        extIds = [line.strip() for line in fh if line.strip()]
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
    listHtml = []
    # query Europe PMC with id and store required fields
    for ext_id in extIds:
        with urlopen(query + ext_id) as response:
            doc = parse(response)
        # resultList/result are xml tags
        for item in doc.findall("resultList/result"):
            received = _element_text(item, 'id')
            if not received:
                print('sent: ' + ext_id + ' received: no id element')
                sys.exit(1)
            if received == ext_id:
                logger.info("ok: %s", ext_id)
            else:
                # A differing id is still listed, but leaves a trace in the log.
                logger.warning('sent: %s received: %s', ext_id, received)
            listHtml.append(_format_publication(item, ext_id))
    # sort the UTF-8 entries with the ICU collation rules of the locale
    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))
    # store publications html info into file
    fhtmlResult = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    # Explicit UTF-8 so non-ASCII names do not depend on the locale encoding.
    with open(fhtmlResult, 'w+', encoding='utf-8') as fout:
        print("<h1>Publications by SIB Members published in ",month," </h1>", file=fout)
        print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: " + datetime.datetime.now().strftime('%Y-%m-%d') + "</i>. </h6>", file=fout)
        print("<h2>Peer-reviewed articles and conference proceedings</h2>", file=fout)
        print("<ol id=\"sibpublis\">", file=fout)
        for line in listHtmlSorted:
            print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
        print("</ol>", file=fout)
#
# Main program
#
def previous_month(today):
    """Return ``(year, month)`` of the calendar month preceding *today*.

    January rolls back to December of the previous year.
    """
    if today.month == 1:
        return today.year - 1, 12
    return today.year, today.month - 1


def parse_period(args, today):
    """Return the ``(year, month)`` to report on.

    *args* are the command-line arguments without the program name. With
    fewer than two arguments the month preceding *today* is used; otherwise
    ``args[0]`` is the year and ``args[1]`` the month.

    Raises ValueError, carrying a message for the user, when the year is not
    an integer between 1998 and the current year or the month is not an
    integer between 1 and 12.
    """
    if len(args) < 2:
        # previous month by default
        return previous_month(today)
    try:
        year = int(args[0])
    except ValueError:
        year = None
    if year is None or not 1998 <= year <= today.year:
        raise ValueError("year must be an integer between 1998 and " + str(today.year))
    try:
        month = int(args[1])
    except ValueError:
        month = None
    if month is None or not 1 <= month <= 12:
        raise ValueError('month must be an integer between 1 and 12')
    return year, month


if __name__ == "__main__":
    # Module-level state read by the functions of this script.
    ids = list()
    # Result directories are created next to the script itself.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    today = datetime.date.today()
    today_time_stamp = today.strftime('%Y%m%d')
    # argv[1] and argv[2] are the optional year and month (both or neither)
    try:
        year, month = parse_period(sys.argv[1:], today)
    except ValueError as err:
        print(err)
        print(*sys.argv[:3])
        sys.exit(1)
    month_nb_days = calendar.monthrange(year, month)[1]
    resultDirName = 'results_' + str(year) + '-' + str(month).zfill(2) + '_' + today_time_stamp
    resultFilePrefix = str(year) + '-' + str(month).zfill(2) + '_' + today_time_stamp + '-'
    os.makedirs(resultDirName, exist_ok=True)
    print ('Requested year:',year, ',month: ',month, '(',month_nb_days,'day-s). Timestamp: ', today_time_stamp)
    getPublicationsWithSIBAffilliation(year,month,month_nb_days)
    # NOTE(review): the fetch step already appends every id to ids, so this
    # call appends the same ids a second time; ids is not read afterwards.
    readIDs()
    writeHtml()
    sys.exit(0)

Event Timeline