SIB-GL-publications.py

#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
# -*- coding: utf-8 -*-
"""
Program to retrieve SIB publications from EuropePMC.org for a given year
Verifies with SIB group leaders and secondary affiliations
Created on Thu Dec 22 09:08:41 2016
@author: vioannid, hstockin
"""
### to read character encoded files
import codecs
import sys
import os
import csv
from pprint import pprint
import datetime
### import xml.etree.ElementTree as ET
from urllib.request import urlopen
from xml.etree.ElementTree import parse
### import regular expressions
import re
### used for UTF-8 sorting
import PyICU
from functools import cmp_to_key
### error handling
import logging
import traceback
### HTML validation using HTML tidy
from tidylib import tidy_document, tidy_fragment
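
# Usage (inferred from the argv handling in the main block below; not stated
# elsewhere in the file):
#   ./SIB-GL-publications.py [year]
# With no argument, the current year is used. Results are written under
# results-<year>_<YYYYMMDD>/.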

# Retrieve the 'Other Articles' section from sib.swiss/scientific-publications
# Save to variable to be used in writeHtml()
# def getOtherPublications():
#     url = 'http://www.sib.swiss/research/scientific-publications'
#     sock = urlopen(url).read().decode('utf-8')
#     pat = re.compile(r"((<!--OTHER-->.*<!--END-OTHER-->))", re.M | re.S)
#     global otherPub
#
#     try:
#         otherPub = pat.search(sock)
#
#     except:
#         print("Problem with the _Other articles_ sections")
#         sys.exit(1)

# Retrieve all publications where SIB is mentioned in the author list
# @param year of publication
def getPublicationsWithSIBAffiliation(year):
    print("SIB publications:", year)
    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
    date = 'FIRST_PDATE:[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
    #date = 'FIRST_PDATE:[' + str(year) + '-05-01%20TO%20' + str(year) + '-05-31]'
    # pagination default is limited to 25 entries
    page = '&cursorMark=*&pageSize=1000'
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + sib + date + page
    u = urlopen(query)
    doc = parse(u)
    # resultList/result are xml tags
    lst = doc.findall("resultList/result")
    # hitCount is an xml tag
    entries = doc.findall("hitCount")
    print('Number of entries found = ' + entries[0].text)
    if int(entries[0].text) > 1000:
        print('Error: more than 1000 entries found - need to extend query')
        sys.exit(1)
    # store publications (result) 'id' into file
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fout = open(fresult, 'w+')
    for item in lst:
        ids.append(item.find('id').text)
        print(item.find('id').text, file=fout)
    fout.close()
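
# For reference, the query assembled above has this overall shape (split here
# for readability; the search terms are concatenated without separators, as in
# the example working URL further below):
#   http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=
#     aff:%22Swiss%20Institute%20of%20Bioinformatics%22
#     FIRST_PDATE:[2016-01-01%20TO%202016-12-31]&cursorMark=*&pageSize=1000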

# Read IDs from a file and assign to ids()
def readIDs():
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    fh = open(fresult)
    for line in fh:
        ids.append(line.strip())
    fh.close()

# Write final set of all IDs (stored in idSet)
def writeAllIDs():
    resultFile = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fresult = resultFile
    fout = open(fresult, 'w+')
    for item in idSet:
        print(item, file=fout)
    fout.close()
    print('All IDs are written into the following file: ', resultFile)
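
# Input format of conf/list-of-GLs.csv, as inferred from the parsing below:
# one group leader per row, column 0 = name (used for the AUTH: term),
# column 1 = home-university affiliation (used for the AFF: term). Rows whose
# first field starts with '#', 'SIB' or '"(AUTH' are skipped as header/comment
# rows.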

# Retrieve publications for each of the GLs using their home university
# @param year of publication
def getPublicationsForGroupLeaders(year):
    # import list of GL and affiliation from file
    fname = 'conf/list-of-GLs.csv'
    #fname = 'conf/list-of-GLs-test-3.csv'
    fresult = resultDirName + '/' + resultFilePrefix + 'publications-for-GLs.csv'
    fout = open(fresult, 'w+')
    wr = csv.writer(fout, quoting=csv.QUOTE_ALL)
    # adapt the encoding according to the input file
    fh = codecs.open(fname, encoding='ISO-8859-1')
    global idSet
    idSet = set(ids)
    print("Name , SIB, Uni , Paper IDs", file=fout)
    # for each GL, find publications with and without SIB affiliation
    for lines in fh:
        line = lines.strip().split(',')
        # exclude special rows
        if line[0].startswith('#') or line[0].startswith('SIB') or line[0].startswith('"(AUTH'):
            continue
        auth = '%22' + line[0].replace(" ", "%20") + '%22'
        aff1 = '%22Swiss%20Institute%20of%20Bioinformatics%22'
        aff2 = '%22' + line[1].replace(" ", "%20") + '%22'
        fpdate = '[' + str(year) + '-01-01%20TO%20' + str(year) + '-12-31]'
        # Download the xml and parse it
        prefix = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query='
        # The following lines _do not work_!
        #url1 = prefix + 'AUTH:' + auth + ' AFF:' + aff1 + ' FIRST_PDATE:' + fpdate
        #url1 = prefix + 'auth:' + auth + ' aff:' + aff1 + ' first_pdate:' + fpdate
        #url1 = prefix + 'auth:' + auth + 'aff:' + aff1 + 'first_pdate:' + fpdate
        url1 = prefix + 'AUTH:' + auth + '%20AFF:' + aff1 + '%20FIRST_PDATE:' + fpdate
        url2 = prefix + 'AUTH:' + auth + '%20AFF:' + aff2 + '%20FIRST_PDATE:' + fpdate
        u1 = urlopen(url1)
        u2 = urlopen(url2)
        #print(url1)
        #print(url2)
        # example working url below
        #u = urlopen('http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=AUTH:"Stockinger%20H"AFF:"swiss%20institute%20of%20bioinformatics"FIRST_PDATE:[2016-01-01%20TO%202016-12-31]')
        doc = parse(u1)
        vava = doc.findall("request")
        #for truc in vava:
        #    print(truc.find('query').text)
        lst1 = doc.findall("resultList/result")
        ids1 = list()
        for item in lst1:
            ids1.append(item.find('id').text)
        doc2 = parse(u2)
        lst2 = doc2.findall("resultList/result")
        ids2 = list()
        for item in lst2:
            ids2.append(item.find('id').text)
        #print(line[0])
        #print('counts:', len(ids1), '//ids1:', ids1)
        #print('counts:', len(ids2), '//ids2:', ids2)
        un = set(ids1)
        deux = set(ids2)
        #print('counts union:', len(un | deux), '-> union:', un | deux)
        #print('counts intersection:', len(un & deux), '-> intersection:', un & deux)
        #print('counts only local:', len(deux - un), '-> only local:', deux - un)
        #print('In ids2 but not in ids:', deux.difference(idSet))
        #print('Len(idSet)', len(idSet))
        idSet = idSet.union(deux)
        print('Len(idSet)', len(idSet))
        # exit(2)
        mylist = list()
        mylist.append(line[0])
        mylist.append(len(ids1))
        mylist.append(len(ids2))
        mylist.extend(list(un | deux))
        wr.writerow(mylist)
    fout.close()

def validateHtml():
    htmlFile = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    htmlHandle = open(htmlFile, "r")
    htmlString = htmlHandle.read()
    htmlHandle.close()
    tidyoptions = {
        "indent": "auto",
        "indent-spaces": 2,
        "wrap": 72,
        "markup": True,
        "output-xml": False,
        "input-xml": False,
        "show-warnings": True,
        "numeric-entities": True,
        "quote-marks": True,
        "quote-nbsp": True,
        "quote-ampersand": False,
        "break-before-br": False,
        "uppercase-tags": False,
        "uppercase-attributes": False,
    }
    htmlString, errors = tidy_fragment(htmlString, tidyoptions)
    if errors:
        return False
    return True
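
# Note: pytidylib's tidy_fragment() returns a (document, errors) pair;
# validateHtml() above treats any warning/error text from tidy as a failure.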

# Retrieve all publications with the following fields:
# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
# import list of IDs from results/all-IDs.txt
def writeHtml():
    #global year
    fallIds = resultDirName + '/' + resultFilePrefix + 'all-IDs.txt'
    fh = open(fallIds)
    extIds = list()
    for line in fh:
        extIds.append(line.strip())
    fh.close()
    # print extIds, len(extIds), len(sorted(set(extIds)))
    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
    listHtml = []
    # query EuropePMC with id and store required fields
    for ext_id in extIds:
        u = urlopen(query + ext_id)
        doc = parse(u)
        # resultList/result are xml tags
        lst = doc.findall("resultList/result")
        for item in lst:
            rA = ''
            rT = ''
            rJT = ''
            rJV = ''
            rPY = ''
            rIssue = ''
            rPI = ''
            rDOIstr = ''
            rDOI = ''
            try:
                if (item.find('id').text == ext_id):
                    logger.info("ok: %s", ext_id)
            except:
                print('sent: ' + ext_id + ' received: ' + item.find('id').text)
                sys.exit(1)
            try:
                # cases with more than 6 authors => add the more/less toggle functionality
                authStr = item.find('authorString').text
                auths = authStr.strip().split(',')
                if len(auths) > 6:
                    auths_pre = auths[0:6]
                    auths_pre = ','.join(auths_pre)
                    auths_post = ['<span class="toggleme">'] + auths[6:]
                    auths_post = ','.join(auths_post)
                    authsList = [auths_pre, auths_post]
                    # close the toggled span
                    authsList.append('</span>')
                    rA = ''.join(authsList)
                else:
                    rA = item.find('authorString').text
            except:
                rA = ''
            try:
                rT = item.find('title').text
            except:
                rT = ''
            try:
                rJT = item.find('journalTitle').text
            except:
                rJT = ''
            try:
                rPY = item.find('pubYear').text
            except:
                rPY = ''
            try:
                rJV = item.find('journalVolume').text
                if rJV is not None:
                    rJV = ';' + rJV
                else:
                    rJV = ''
            except:
                rJV = ''
            try:
                rIssue = item.find('issue').text
                if rIssue is not None:
                    rIssue = '(' + rIssue + ')'
                else:
                    rIssue = ''
            except:
                rIssue = ''
            try:
                rPI = item.find('pageInfo').text
                if rPI is not None:
                    rPI = ':' + rPI
                else:
                    rPI = ''
            except:
                rPI = ''
            try:
                rDOIstr = item.find('doi').text
                rDOI = ' <a href="https://doi.org/' + rDOIstr + '" target="_blank">'
                # logger.warning('DOI ok: %s | %s | %s', rDOIstr, rDOI, ext_id)
            except Exception:
                # print("DOI not found, using pmid: " + ext_id)
                logger.warning('DOI problem before: %s - %s - %s', rDOIstr, rDOI, ext_id)
                rDOIstr = ext_id
                rDOI = ' <a href="http://europepmc.org/search?query=' + rDOIstr + '" target="_blank">'
                logger.warning('DOI problem after: %s - %s - %s', rDOIstr, rDOI, ext_id)
            listHtml.append(rA + rDOI + rT + "</a>" + " <em>" + rJT + "</em>" + " " + rPY + rJV + rIssue + rPI)
    # sort a UTF-8 list case insensitive
    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))
    # store publications html info into file
    fhtmlResult = resultDirName + '/' + resultFilePrefix + 'all-SIB-publications.txt'
    fout = open(fhtmlResult, 'w+')
    print("<h1>Publications by SIB Members published in ", year, " </h1>", file=fout)
    from datetime import datetime
    print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: "
          + datetime.now().strftime('%Y-%m-%d') + "</i>. </h6>", file=fout)
    print("<h2>Peer-reviewed articles and conference proceedings</h2>", file=fout)
    print('<p>Search <input type="text"></p>', file=fout)
    print("<ol id=\"sibpublis\">", file=fout)
    for line in listHtmlSorted:
        print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
    print("</ol>", file=fout)
    # add the 'other publications' section
    #print(str(otherPub[0]), file=fout)
    fout.close()
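
# The fragment written above consists of: an <h1> heading with the year, an
# <h6> update note, an <h2> section title, a text <input> for client-side
# search, and an <ol id="sibpublis"> whose <li class="sibpubli"> entries carry
# the per-paper HTML built in the loop (authors past the sixth wrapped in a
# <span class="toggleme"> for the more/less toggle).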

#
# Main program
#
if __name__ == "__main__":
    ids = list()
    idSet = set()
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    otherPub = None
    today = str(datetime.date.today()).replace('-', '')
    FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
    # logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    # argv[1] is optional parameter for year
    if len(sys.argv) >= 2:
        try:
            year = int(sys.argv[1])
            print(year)
            if year < 1998 or year > 2020:
                sys.exit(1)
        except:
            print("Year must be an integer between 1998 and 2020")
            print(sys.argv[0], sys.argv[1])
            sys.exit(1)
    else:
        year = int(today[:4])
    resultDirName = 'results-' + str(year) + '_' + today
    resultFilePrefix = str(year) + '_' + today + '-'
    if not os.path.exists(resultDirName):
        os.mkdir(resultDirName)
    #getOtherPublications()
    getPublicationsWithSIBAffiliation(year)
    readIDs()
    getPublicationsForGroupLeaders(year)
    print('\nNumber of publications obtained:', len(idSet))
    writeAllIDs()
    writeHtml()
    if validateHtml() is False:
        print("Malformatted HTML file")
        sys.exit(1)
    else:
        print("Valid HTML")
    sys.exit(0)
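
# Example run (illustrative; file names follow the prefixes used above):
#   ./SIB-GL-publications.py 2016
# produces results-2016_<YYYYMMDD>/ containing:
#   2016_<YYYYMMDD>-all-SIB-IDs.txt           IDs from the SIB affiliation query
#   2016_<YYYYMMDD>-publications-for-GLs.csv  per-GL publication counts and IDs
#   2016_<YYYYMMDD>-all-IDs.txt               the merged ID set
#   2016_<YYYYMMDD>-all-SIB-publications.txt  the HTML fragment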