Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F99647094
SIB-monthly-publications.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jan 25, 23:28
Size
8 KB
Mime Type
text/x-python
Expires
Mon, Jan 27, 23:28 (2 d)
Engine
blob
Format
Raw Data
Handle
23838312
Attached To
R2915 eSCT pipeline interoperability
SIB-monthly-publications.py
View Options
#!/Users/vioannid/Documents/PYTPUBLIS3/bin/python
# -*- coding: utf-8 -*-
"""
Program to retrieve SIB publications from EuropePMC.org for a given month
Created on Mon Jun 26 11:22:00 2017
@author: vioannid, hstockin
"""
### to read character encoded files
import
codecs
import
sys
import
os
import
csv
from
pprint
import
pprint
import
datetime
import
calendar
### import xml.etree.ElementTree as ET
from
urllib.request
import
urlopen
from
xml.etree.ElementTree
import
parse
### import regular expressions
import
re
### used for UTF-8 sorting
import
PyICU
from
functools
import
cmp_to_key
### error handling
import
logging
import
traceback
# Retrieve all publications where SIB is mentioned as affiliation in the author list
# @param year,month,month_nb_days of publication
def getPublicationsWithSIBAffilliation(year, month, month_nb_days):
    """Fetch the IDs of SIB-affiliated publications for one month.

    Queries the Europe PMC REST search service for entries whose affiliation
    contains "Swiss Institute of Bioinformatics" and whose FIRST_PDATE lies
    between the first day and day ``month_nb_days`` of ``year``-``month``.
    Every ID found is appended to the global list ``ids`` and written, one
    per line, to ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``.

    @param year           publication year
    @param month          publication month (1-12)
    @param month_nb_days  number of days in that month (upper date bound)

    Exits the program with status 1 when the response has no hitCount element
    or reports more hits than fit into the single page that is requested.
    """
    print("SIB publications:", year, month, month_nb_days)

    # The service returns only 25 entries per page by default, so one large
    # page is requested instead of following the cursor.
    page_size = 1000
    year_month = str(year) + '-' + str(month).zfill(2)

    sib = 'aff:%22Swiss%20Institute%20of%20Bioinformatics%22'
    date = ('FIRST_PDATE:[' + year_month + '-01%20TO%20'
            + year_month + '-' + str(month_nb_days).zfill(2) + ']')
    page = '&cursorMark=*&pageSize=' + str(page_size)
    query = ('http://www.ebi.ac.uk/europepmc/webservices/rest/search?query='
             + sib + date + page)
    print(query)

    # Close the HTTP response as soon as the XML has been parsed.
    with urlopen(query) as response:
        doc = parse(response)

    # resultList/result and hitCount are xml tags of the response
    results = doc.findall("resultList/result")
    hit_count = doc.findtext("hitCount")
    if hit_count is None:
        print('Error: no hitCount element found in the Europe PMC response')
        sys.exit(1)

    print('Number of entries found = ' + hit_count)
    if int(hit_count) > page_size:
        print('Error: more than 1000 entries found - need to extend query')
        sys.exit(1)

    # store publications (result) 'id' into file
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    with open(fresult, 'w') as fout:
        for item in results:
            pub_id = item.findtext('id')
            if not pub_id:
                # A result without an id cannot be looked up later; skip it
                # rather than writing the string "None" into the ID file.
                print('Warning: skipping a result without id')
                continue
            ids.append(pub_id)
            print(pub_id, file=fout)
# Read the publication IDs back from the all-SIB-IDs.txt file and append them to the global list `ids`
def readIDs():
    """Append the IDs stored in the all-SIB-IDs.txt result file to ``ids``.

    Reads ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt`` line by line,
    strips surrounding whitespace and appends every non-empty line to the
    global list ``ids``.
    """
    fresult = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    # 'with' guarantees the handle is closed even if reading fails part-way.
    with open(fresult) as fh:
        for line in fh:
            pub_id = line.strip()
            if pub_id:  # ignore blank lines instead of storing empty IDs
                ids.append(pub_id)
# Retrieve all publications with the following fields:
# authorString, title, journalTitle, issue, journalVolume, pubYear, journalIssn, pageInfo, doi
# the list of IDs is read from <resultDirName>/<resultFilePrefix>all-SIB-IDs.txt
def _format_authors(author_string):
    """Return the author list, wrapping authors beyond the sixth in a toggle span."""
    if not author_string:
        return ''
    auths = author_string.strip().split(',')
    if len(auths) <= 6:
        return author_string
    # cases with more than 6 authors => add the more/less toggle functionality
    auths_pre = ','.join(auths[0:6])
    auths_post = ','.join(['<span class="toggleme">'] + auths[6:])
    return auths_pre + auths_post + '</span>'


def _format_publication(item, ext_id):
    """Build the HTML fragment (without the <li> wrapper) for one result element."""
    # findtext() returns None for a missing element and for an empty one;
    # 'or' maps both to the empty string so the concatenation below is safe.
    rA = _format_authors(item.findtext('authorString'))
    rT = item.findtext('title') or ''
    rJT = item.findtext('journalTitle') or ''
    rPY = item.findtext('pubYear') or ''

    rJV = item.findtext('journalVolume')
    rJV = ';' + rJV if rJV else ''
    rIssue = item.findtext('issue')
    rIssue = '(' + rIssue + ')' if rIssue else ''
    rPI = item.findtext('pageInfo')
    rPI = ':' + rPI if rPI else ''

    rDOIstr = item.findtext('doi')
    if rDOIstr:
        rDOI = ' <a href="https://doi.org/' + rDOIstr + '" target="_blank">'
    else:
        # No DOI available: link to a Europe PMC search for the ID instead.
        logger.warning('DOI problem before: %s - %s - %s', rDOIstr, '', ext_id)
        rDOIstr = ext_id
        rDOI = (' <a href="http://europepmc.org/search?query='
                + rDOIstr + '" target="_blank">')
        logger.warning('DOI problem after: %s - %s - %s', rDOIstr, rDOI, ext_id)

    # NOTE(review): the field values are inserted into the HTML unescaped,
    # exactly as before; confirm whether titles may carry intended markup
    # before adding html.escape() here.
    return (rA + rDOI + rT + "</a>" + " <em>" + rJT + "</em>" + " "
            + rPY + rJV + rIssue + rPI)


def writeHtml():
    """Write the sorted HTML publication list for the stored IDs.

    Reads the IDs from ``<resultDirName>/<resultFilePrefix>all-SIB-IDs.txt``,
    queries Europe PMC once per ID (``EXT_ID:<id>``) and collects, for every
    result returned, the fields authorString, title, journalTitle, pubYear,
    journalVolume, issue, pageInfo and doi. The formatted entries are sorted
    with the PyICU 'en_GB.UTF-8' collator and written as an ordered HTML list
    to ``<resultDirName>/<resultFilePrefix>all-SIB-publications.txt``.

    Uses the globals ``resultDirName``, ``resultFilePrefix``, ``month`` and
    ``logger``. Exits with status 1 when a returned result carries no id.
    """
    fallIds = resultDirName + '/' + resultFilePrefix + 'all-SIB-IDs.txt'
    with open(fallIds) as fh:
        # Blank lines would turn into an empty "EXT_ID:" query, so drop them.
        extIds = [line.strip() for line in fh if line.strip()]

    query = 'http://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:'
    listHtml = []

    # query Europe PMC with id and store required fields
    for ext_id in extIds:
        with urlopen(query + ext_id) as response:
            doc = parse(response)
        # resultList/result are xml tags
        for item in doc.findall("resultList/result"):
            received = item.findtext('id')
            if received is None:
                print('sent: ' + ext_id + ' received: no id')
                sys.exit(1)
            if received == ext_id:
                logger.info("ok: %s", ext_id)
            else:
                # Entries with a different id were always kept; make it visible.
                logger.warning('sent: %s received: %s', ext_id, received)
            listHtml.append(_format_publication(item, ext_id))

    # sort a UTF-8 list case insensitive
    collator = PyICU.Collator.createInstance(PyICU.Locale('en_GB.UTF-8'))
    listHtmlSorted = sorted(listHtml, key=cmp_to_key(collator.compare))

    # store publications html info into file; author names may contain
    # non-ASCII characters, so the encoding is fixed instead of locale-dependent
    fhtmlResult = (resultDirName + '/' + resultFilePrefix
                   + 'all-SIB-publications.txt')
    with open(fhtmlResult, 'w', encoding='utf-8') as fout:
        print("<h1>Publications by SIB Members published in ", month,
              " </h1>", file=fout)
        print("<h6>Listed in <a href='https://europepmc.org' target='_blank'>Europe PMC</a>, <i>latest update: "
              + datetime.datetime.now().strftime('%Y-%m-%d')
              + "</i>. </h6>", file=fout)
        print("<h2>Peer-reviewed articles and conference proceedings</h2>",
              file=fout)
        print("<ol id=\"sibpublis\">", file=fout)
        for line in listHtmlSorted:
            print("<li class=\"sibpubli\">" + line + "</li>", file=fout)
        print("</ol>", file=fout)
#
# Main program
#
if __name__ == "__main__":
    # Script entry point.
    # Usage: SIB-monthly-publications.py [year month]
    # Without both arguments the month preceding today's date is processed.
    ids = list()
    idSet = set()
    # Result directories are created relative to the script's own location.
    os.chdir(os.path.dirname(os.path.realpath(__file__)))

    otherPub = None
    FORMAT = '%(asctime)-15s %(clientip)s %(user)-8s %(message)s'
    # logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)

    today_date = datetime.date.today()
    today = str(today_date)
    today_time_stamp = today.replace('-', '')

    # argv[1] is the year and argv[2] the month; both are optional but must
    # be given together
    if len(sys.argv) >= 3:
        # The upper bound follows the current year instead of a fixed value
        # that silently goes stale.
        max_year = today_date.year
        try:
            year = int(sys.argv[1])
        except ValueError:
            year = None
        if year is None or year < 1998 or year > max_year:
            print("year must be an integer between 1998 and " + str(max_year))
            print(sys.argv[0], sys.argv[1], sys.argv[2])
            sys.exit(1)

        try:
            month = int(sys.argv[2])
        except ValueError:
            month = None
        if month is None or month < 1 or month > 12:
            print('month must be an integer between 1 and 12')
            # Stop here: continuing would build a query with an invalid date.
            sys.exit(1)
    else:
        # previous month by default; in January this is December of the
        # previous year (month 0 does not exist)
        if today_date.month == 1:
            year = today_date.year - 1
            month = 12
        else:
            year = today_date.year
            month = today_date.month - 1

    month_nb_days = calendar.monthrange(year, month)[1]

    date_filename = datetime.datetime.strptime(today, "%Y-%m-%d")
    period = str(year) + '-' + str(month).zfill(2) + '_' + today_time_stamp
    resultDirName = 'results_' + period
    resultFilePrefix = period + '-'
    os.makedirs(resultDirName, exist_ok=True)

    print('Requested year:', year, ',month: ', month, '(', month_nb_days,
          'day-s). Timestamp: ', today_time_stamp)

    getPublicationsWithSIBAffilliation(year, month, month_nb_days)
    readIDs()
    writeHtml()
    sys.exit(0)
Event Timeline
Log In to Comment