## CDS Invenio is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 2 of the
## License, or (at your option) any later version.
##
## CDS Invenio is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with CDS Invenio; if not, write to the Free Software Foundation, Inc.,
## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
"""
Bibclassify keyword extractor command line entry point.
"""
# Module revision marker. "$Id$" looks like an unexpanded version-control
# keyword placeholder -- NOTE(review): confirm which tool fills it in.
__revision__ = "$Id$"
importgetopt
importstring
importos
importre
importsys
importtime
importcopy
importshelve
frominvenio.bibtaskimportwrite_message
# Standalone (Invenio-independent) usage: edit the two paths below so that
# they match the local system.
TMPDIR_STANDALONE = "/tmp"
PDFTOTEXT_STANDALONE = "/usr/bin/pdftotext"

# Ten even values, 12 through 30 inclusive. Presumably the font sizes used
# when rendering keywords -- NOTE(review): confirm against the code that
# reads this list.
fontSize = list(range(12, 31, 2))
defusage(code,msg=''):
"Prints usage for this module."
ifmsg:
sys.stderr.write("Error: %s.\n"%msg)
usagetext="""
Usage: bibclassify [options]
Examples:
bibclassify -f file.pdf -k thesaurus.txt -o TEXT
bibclassify -f file.txt -K taxonomy.rdf -l 120 -m FULL
Specific options:
-f, --file=FILENAME name of the file to be classified (Use '.pdf' extension for PDF files; every other extension is treated as text)
-k, --thesaurus=FILENAME name of the text thesaurus (one keyword per line)
-K, --taxonomy=FILENAME name of the RDF SKOS taxonomy/ontology (a local file or URL)
-o, --output=HTML|TEXT|MARCXML output list of keywords in either HTML, text, or MARCXML
-l, --limit=INTEGER maximum number of keywords that will be processed to generate results (the higher the l, the higher the number of possible composite keywords)
-n, --nkeywords=INTEGER maximum number of single keywords that will be generated
-m, --mode=FULL|PARTIAL processing mode: PARTIAL (run on abstract and selected pages), FULL (run on whole document - more accurate, but slower)
-q, --spires outputs composite keywords in the SPIRES standard format (ckw1, ckw2)