diff --git a/modules/bibclassify/lib/bibclassifylib.py b/modules/bibclassify/lib/bibclassifylib.py index d94b3869c..5b595163d 100644 --- a/modules/bibclassify/lib/bibclassifylib.py +++ b/modules/bibclassify/lib/bibclassifylib.py @@ -1,805 +1,805 @@ # -*- coding: utf-8 -*- ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Bibclassify keyword extractor command line entry point. """ __revision__ = "$Id$" import getopt import string import os import re import sys import time import copy import shelve from invenio.bibtask import write_message # Please point the following variables to the correct paths if using standalone (Invenio-independent) version TMPDIR_STANDALONE = "/tmp" PDFTOTEXT_STANDALONE = "/usr/bin/pdftotext" fontSize = [12, 14, 16, 18, 20, 22, 24, 26, 28, 30] def usage(code, msg=''): "Prints usage for this module." 
if msg: sys.stderr.write("Error: %s.\n" % msg) usagetext = """ Usage: bibclassify [options] Examples: bibclassify -f file.pdf -k thesaurus.txt -o TEXT bibclassify -f file.txt -K taxonomy.rdf -l 120 -m FULL Specific options: -f, --file=FILENAME name of the file to be classified (Use '.pdf' extension for PDF files; every other extension is treated as text) -k, --thesaurus=FILENAME name of the text thesaurus (one keyword per line) -K, --taxonomy=FILENAME name of the RDF SKOS taxonomy/ontology (a local file or URL) -o, --output=HTML|TEXT|MARCXML output list of keywords in either HTML, text, or MARCXML -l, --limit=INTEGER maximum number of keywords that will be processed to generate results (the higher the l, the higher the number of possible composite keywords) -n, --nkeywords=INTEGER maximum number of single keywords that will be generated -m, --mode=FULL|PARTIAL processing mode: PARTIAL (run on abstract and selected pages), FULL (run on whole document - more accurate, but slower) -q, --spires outputs composite keywords in the SPIRES standard format (ckw1, ckw2) General options: -h, --help print this help and exit -V, --version print version and exit -v, --verbose=LEVEL Verbose level (0=min, 1=default, 9=max). """ sys.stderr.write(usagetext) sys.exit(code) def generate_keywords(textfile, dictfile, verbose=0): """ A method that generates a sorted list of keywords of a document (textfile) based on a simple thesaurus (dictfile). 
""" keylist = [] keyws = [] wordlista = os.popen("more " + dictfile) thesaurus = [x[:-1] for x in wordlista.readlines()] for keyword in thesaurus: try: string.atoi(keyword) except ValueError: dummy = 1 else: continue if len(keyword)<=1: #whitespace or one char - get rid of continue else: dictOUT = os.popen('grep -iwc "' +keyword.strip()+'" '+textfile).read() try: occur = int(dictOUT) if occur != 0: keylist.append([occur, keyword]) except ValueError: continue keylist.sort() keylist.reverse() for item in keylist: keyws.append(item[1]) return keyws def generate_keywords_rdf(textfile, dictfile, output, limit, nkeywords, mode, spires, verbose=0, ontology=None): """ A method that generates a sorted list of keywords (text or html output) based on a RDF thesaurus. """ import rdflib keylist = [] ckwlist = {} outlist = [] compositesOUT = [] compositesTOADD = [] keys2drop = [] raw = [] composites = {} compositesIDX = {} text_out = "" html_out = [] store = None reusing_compiled_ontology_p = False compiled_ontology_db = None compiled_ontology_db_file = dictfile + '.db' namespace = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#") if not(os.access(dictfile,os.F_OK) and os.access(compiled_ontology_db_file,os.F_OK) and os.path.getmtime(compiled_ontology_db_file) > os.path.getmtime(dictfile)): # changed graph type, recommended by devel team store = rdflib.ConjunctiveGraph() store.parse(dictfile) compiled_ontology_db = shelve.open(compiled_ontology_db_file) compiled_ontology_db['graph'] = store if verbose >= 3: write_message("Creating compiled ontology %s for the first time" % compiled_ontology_db_file, sys.stderr) else: if verbose >= 3: write_message("Reusing compiled ontology %s" % compiled_ontology_db_file, sys.stderr) reusing_compiled_ontology_p = True compiled_ontology_db = shelve.open(compiled_ontology_db_file) store = compiled_ontology_db['graph'] size = int(os.stat(textfile).st_size) rtmp = open(textfile, 'r') atmp = open(textfile, 'r') # ASSUMPTION: Guessing that the 
first 10% of file contains title and abstract abstract = " " + str(atmp.read(int(size*0.1))) + " " if mode == 1: # Partial mode: analysing only abstract + title + middle portion of document # Abstract and title is generally never more than 20% of whole document. text_string = " " + str(rtmp.read(int(size*0.2))) throw_away = str(rtmp.read(int(size*0.25))) text_string += str(rtmp.read(int(size*0.2))) else: # Full mode: get all document text_string = " " + str(rtmp.read()) + " " atmp.close() rtmp.close() try: # Here we are trying to match the human-assigned keywords # These are generally found in a document after the key phrase "keywords" or similar if text_string.find("Keywords:"): safe_keys = text_string.split("Keywords:")[1].split("\n")[0] elif text_string.find("Key words:"): safe_keys = text_string.split("Key words:")[1].split("\n")[0] elif text_string.find("Key Words:"): safe_keys = text_string.split("Key Words:")[1].split("\n")[0] except: safe_keys = "" if safe_keys != "": write_message("Author keyword string detected: %s" % safe_keys, verbose=8) # Here we start the big for loop around all concepts in the RDF ontology if not reusing_compiled_ontology_p: # we have to compile ontology first: for s,pref in store.subject_objects(namespace["prefLabel"]): dictOUT = 0 safeOUT = 0 hideOUT = 0 candidates = [] wildcard = "" regex = False nostandalone = False # For each concept, we gather the candidates (i.e. 
prefLabel, hiddenLabel and altLabel) candidates.append(pref.strip()) # If the candidate is a ckw and it has no altLabel, we are not interested at this point, go to the next item if store.value(s,namespace["compositeOf"],default=False,any=True) and not store.value(s,namespace["altLabel"],default=False,any=True): continue if str(store.value(s,namespace["note"],any=True)) == "nostandalone": nostandalone = True for alt in store.objects(s, namespace["altLabel"]): candidates.append(alt.strip()) for hid in store.objects(s, namespace["hiddenLabel"]): candidates.append(hid.strip()) # We then create a regex pattern for each candidate and we match it in the document # First we match any possible candidate containing regex. These have to be handled a priori # (because they might cause double matching, e.g. "gauge theor*" will match "gauge theory" for candidate in candidates: if candidate.find("/", 0, 1) > -1: # We have a wildcard or other regex, do not escape chars # Wildcards matched with '\w*'. These truncations should go into hidden labels in the ontology regex = True pattern = makePattern(candidate, 3) wildcard = pattern hideOUT += len(re.findall(pattern,text_string)) # print "HIDEOUT: " + str(candidate) + " " + str(hideOUT) for candidate in candidates: # Different patterns are created according to the type of candidate keyword encountered if candidate.find("/", 0, 1) > -1: # We have already taken care of this continue elif regex and candidate.find("/", 0, 1) == -1 and len(re.findall(wildcard," " + candidate + " ")) > 0: # The wildcard in hiddenLabel matches this candidate: skip it # print "\ncase 2 touched\n" continue elif candidate.find("-") > -1: # We have an hyphen -> e.g. "word-word". Look for: "word-word", "wordword", "word word" (case insensitive) pattern = makePattern(candidate, 2) elif candidate[:2].isupper() or len(candidate) < 3: # First two letters are uppercase or very short keyword. This could be an acronym. 
Better leave case untouched pattern = makePattern(candidate, 1) else: # Let's do some plain case insensitive search pattern = makePattern(candidate, 0) if len(candidate) < 3: # We have a short keyword if len(re.findall(pattern,abstract))> 0: # The short keyword appears in the abstract/title, retain it dictOUT += len(re.findall(pattern,text_string)) safeOUT += len(re.findall(pattern,safe_keys)) else: dictOUT += len(re.findall(pattern,text_string)) safeOUT += len(re.findall(pattern,safe_keys)) dictOUT += hideOUT if dictOUT > 0 and store.value(s,namespace["compositeOf"],default=False,any=True): # This is a ckw whose altLabel occurs in the text ckwlist[s.strip()] = dictOUT elif dictOUT > 0: keylist.append([dictOUT, s.strip(), pref.strip(), safeOUT, candidates, nostandalone]) regex = False keylist.sort() keylist.reverse() compiled_ontology_db['keylist'] = keylist compiled_ontology_db.close() else: # we can reuse compiled ontology: keylist = compiled_ontology_db['keylist'] compiled_ontology_db.close() if limit > len(keylist): limit = len(keylist) if nkeywords > limit: nkeywords = limit # Sort out composite keywords based on limit (default=70) # Work out whether among l single keywords, there are possible composite combinations # Generate compositesIDX dictionary of the form: s (URI) : keylist for i in range(limit): try: if store.value(rdflib.Namespace(keylist[i][1]),namespace["composite"],default=False,any=True): compositesIDX[keylist[i][1]] = keylist[i] for composite in store.objects(rdflib.Namespace(keylist[i][1]),namespace["composite"]): if composites.has_key(composite): composites[composite].append(keylist[i][1]) else: composites[composite]=[keylist[i][1]] elif store.value(rdflib.Namespace(keylist[i][1]),namespace["compositeOf"],default=False,any=True): compositesIDX[keylist[i][1]] = keylist[i] else: outlist.append(keylist[i]) except: write_message("Problem with composites.. 
: %s" % keylist[i][1]) for s_CompositeOf in composites: if len(composites.get(s_CompositeOf)) > 2: write_message("%s - Sorry! Only composite combinations of max 2 keywords are supported at the moment." % s_CompositeOf) elif len(composites.get(s_CompositeOf)) > 1: # We have a composite match. Need to look for composite1 near composite2 comp_one = compositesIDX[composites.get(s_CompositeOf)[0]][2] comp_two = compositesIDX[composites.get(s_CompositeOf)[1]][2] # Now check that comp_one and comp_two really correspond to ckw1 : ckw2 if store.value(rdflib.Namespace(s_CompositeOf),namespace["prefLabel"],default=False,any=True).split(":")[0].strip() == comp_one: # order is correct searchables_one = compositesIDX[composites.get(s_CompositeOf)[0]][4] searchables_two = compositesIDX[composites.get(s_CompositeOf)[1]][4] comp_oneOUT = compositesIDX[composites.get(s_CompositeOf)[0]][0] comp_twoOUT = compositesIDX[composites.get(s_CompositeOf)[1]][0] else: # reverse order comp_one = compositesIDX[composites.get(s_CompositeOf)[1]][2] comp_two = compositesIDX[composites.get(s_CompositeOf)[0]][2] searchables_one = compositesIDX[composites.get(s_CompositeOf)[1]][4] searchables_two = compositesIDX[composites.get(s_CompositeOf)[0]][4] comp_oneOUT = compositesIDX[composites.get(s_CompositeOf)[1]][0] comp_twoOUT = compositesIDX[composites.get(s_CompositeOf)[0]][0] compOUT = 0 wildcards = [] phrases = [] for searchable_one in searchables_one: # Work out all possible combination of comp1 near comp2 c1 = searchable_one if searchable_one.find("/", 0, 1) > -1: m1 = 3 elif searchable_one.find("-") > -1: m1 = 2 elif searchable_one[:2].isupper() or len(searchable_one) < 3: m1 = 1 else: m1 = 0 for searchable_two in searchables_two: c2 = searchable_two if searchable_two.find("/", 0, 1) > -1: m2 = 3 elif searchable_two.find("-") > -1: m2 = 2 elif searchable_two[:2].isupper() or len(searchable_two) < 3: m2 = 1 else: m2 = 0 c = [c1,c2] m = [m1,m2] patterns = makeCompPattern(c, m) if m1 == 3 or m2 == 
3: # One of the composites had a wildcard inside wildcards.append(patterns[0]) wildcards.append(patterns[1]) else: # No wildcards phrase1 = c1 + " " + c2 phrase2 = c2 + " " + c1 phrases.append([phrase1, patterns[0]]) phrases.append([phrase2, patterns[1]]) THIScomp = len(re.findall(patterns[0],text_string)) + len(re.findall(patterns[1],text_string)) compOUT += THIScomp if len(wildcards)>0: for wild in wildcards: for phrase in phrases: if len(re.findall(wild," " + phrase[0] + " ")) > 0: compOUT = compOUT - len(re.findall(phrase[1],text_string)) # Add extra results due to altLabels, calculated in the first part if ckwlist.get(s_CompositeOf, 0) > 0: # Add count and pop the item out of the dictionary compOUT += ckwlist.pop(s_CompositeOf) if compOUT > 0 and spires: # Output ckws in spires standard output mode (,) if store.value(rdflib.Namespace(s_CompositeOf),namespace["spiresLabel"],default=False,any=True): compositesOUT.append([compOUT, store.value(rdflib.Namespace(s_CompositeOf),namespace["spiresLabel"],default=False,any=True), comp_one, comp_two, comp_oneOUT, comp_twoOUT]) else: compositesOUT.append([compOUT, store.value(rdflib.Namespace(s_CompositeOf),namespace["prefLabel"],default=False,any=True).replace(":",","), comp_one, comp_two, comp_oneOUT, comp_twoOUT]) keys2drop.append(comp_one.strip()) keys2drop.append(comp_two.strip()) elif compOUT > 0: # Output ckws in bibclassify mode (:) compositesOUT.append([compOUT, store.value(rdflib.Namespace(s_CompositeOf),namespace["prefLabel"],default=False,any=True), comp_one, comp_two, comp_oneOUT, comp_twoOUT]) keys2drop.append(comp_one.strip()) keys2drop.append(comp_two.strip()) # Deal with ckws that only occur as altLabels ckwleft = len(ckwlist) while ckwleft > 0: compositesTOADD.append(ckwlist.popitem()) ckwleft = ckwleft - 1 for s_CompositeTOADD, compTOADD_OUT in compositesTOADD: if spires: compositesOUT.append([compTOADD_OUT, 
store.value(rdflib.Namespace(s_CompositeTOADD),namespace["prefLabel"],default=False,any=True).replace(":",","), "null", "null", 0, 0]) else: compositesOUT.append([compTOADD_OUT, store.value(rdflib.Namespace(s_CompositeTOADD),namespace["prefLabel"],default=False,any=True), "null", "null", 0, 0]) compositesOUT.sort() compositesOUT.reverse() # Some more keylist filtering: inclusion, e.g subtract "magnetic" if have "magnetic field" for i in keylist: pattern_to_match = " " + i[2].strip() + " " for j in keylist: test_key = " " + j[2].strip() + " " if test_key.strip() != pattern_to_match.strip() and test_key.find(pattern_to_match) > -1: keys2drop.append(pattern_to_match.strip()) text_out += "\nComposite keywords:\n" for ncomp, pref_cOf_label, comp_one, comp_two, comp_oneOUT, comp_twoOUT in compositesOUT: safe_comp_mark = " " safe_one_mark = "" safe_two_mark = "" if safe_keys.find(pref_cOf_label)>-1: safe_comp_mark = "*" if safe_keys.find(comp_one)>-1: safe_one_mark = "*" if safe_keys.find(comp_two)>-1: safe_two_mark = "*" raw.append([str(ncomp),str(pref_cOf_label)]) text_out += str(ncomp) + safe_comp_mark + " " + str(pref_cOf_label) + " [" + str(comp_oneOUT) + safe_one_mark + ", " + str(comp_twoOUT) + safe_two_mark + "]\n" if safe_comp_mark == "*": html_out.append([ncomp, str(pref_cOf_label), 1]) else: html_out.append([ncomp, str(pref_cOf_label), 0]) text_out += "\n\nSingle keywords:\n" for i in range(limit): safe_mark = " " try: idx = keys2drop.index(keylist[i][2].strip()) except: idx = -1 if safe_keys.find(keylist[i][2])>-1: safe_mark = "*" if idx == -1 and nkeywords > 0 and not keylist[i][5]: text_out += str(keylist[i][0]) + safe_mark + " " + keylist[i][2] + "\n" raw.append([keylist[i][0], keylist[i][2]]) if safe_mark == "*": html_out.append([keylist[i][0], keylist[i][2], 1]) else: html_out.append([keylist[i][0], keylist[i][2], 0]) nkeywords = nkeywords - 1 if output == 0: # Output some text return text_out elif output == 2: # return marc xml output. 
xml = "" for key in raw: xml += """ %s BibClassify/%s """ % (key[1],os.path.splitext(os.path.basename(ontology))[0]) return xml else: # Output some HTML html_out.sort() html_out.reverse() return make_tag_cloud(html_out) def make_tag_cloud(entries): """Using the counts for each of the tags, write a simple HTML page to standard output containing a tag cloud representation. The CSS describes ten levels, each of which has differing font-size's, line-height's and font-weight's. """ max_occurrence = int(entries[0][0]) ret = "\n" ret += "\n" ret += "Keyword Cloud\n" ret += "\n" ret += "\n" ret += "\n" ret += "\n" cloud = "" cloud_list = [] cloud += '
' # Generate some ad-hoc count distribution for i in range(0, len(entries)): count = int(entries[i][0]) tag = str(entries[i][1]) color = int(entries[i][2]) if count < (max_occurrence/10): cloud_list.append([tag,0,color]) elif count < (max_occurrence/7.5): cloud_list.append([tag,1,color]) elif count < (max_occurrence/5): cloud_list.append([tag,2,color]) elif count < (max_occurrence/4): cloud_list.append([tag,3,color]) elif count < (max_occurrence/3): cloud_list.append([tag,4,color]) elif count < (max_occurrence/2): cloud_list.append([tag,5,color]) elif count < (max_occurrence/1.7): cloud_list.append([tag,6,color]) elif count < (max_occurrence/1.5): cloud_list.append([tag,7,color]) elif count < (max_occurrence/1.3): cloud_list.append([tag,8,color]) else: cloud_list.append([tag,9,color]) cloud_list.sort() for i in range(0, len(cloud_list)): cloud += ' 0: cloud += 'style="color:red" ' cloud += '> %s ' % cloud_list[i][0] cloud += '
' ret += cloud + '\n' ret += "
\n" ret += "\n" return ret def makeCompPattern(candidates, modes): """Takes a set of two composite keywords (candidates) and compiles a REGEX expression around it, according to the chosen modes for each one: - 0 : plain case-insensitive search - 1 : plain case-sensitive search - 2 : hyphen - 3 : wildcard""" begREGEX = '(?:[^A-Za-z0-9\+-])(' endREGEX = ')(?=[^A-Za-z0-9\+-])' pattern_text = [] patterns = [] for i in range(2): if modes[i] == 0: pattern_text.append(str(re.escape(candidates[i]) + 's?')) if modes[i] == 1: pattern_text.append(str(re.escape(candidates[i]))) if modes[i] == 2: hyphen = True parts = candidates[i].split("-") pattern_string = "" for part in parts: if len(part)<1 or part.find(" ", 0, 1)> -1: # This is not really a hyphen, maybe a minus sign: treat as isupper(). hyphen = False pattern_string = pattern_string + re.escape(part) + "[- \t]?" if hyphen: pattern_text.append(pattern_string) else: pattern_text.append(re.escape(candidates[i])) if modes[i] == 3: pattern_text.append(candidates[i].replace("/","")) pattern_one = re.compile(begREGEX + pattern_text[0] + "s?[ \s,-]*" + pattern_text[1] + endREGEX, re.I) pattern_two = re.compile(begREGEX + pattern_text[1] + "s?[ \s,-]*" + pattern_text[0] + endREGEX, re.I) patterns.append(pattern_one) patterns.append(pattern_two) return patterns def makePattern(candidate, mode): """Takes a keyword (candidate) and compiles a REGEX expression around it, according to the chosen mode: - 0 : plain case-insensitive search - 1 : plain case-sensitive search - 2 : hyphen - 3 : wildcard""" # NB. At the moment, some patterns are compiled having an optional trailing "s". # This is a very basic method to find plurals in English. # If this program is to be used in other languages, please remove the "s?" from the REGEX # Also, inclusion of plurals at the ontology level would be preferred. 
begREGEX = '(?:[^A-Za-z0-9\+-])(' endREGEX = ')(?=[^A-Za-z0-9\+-])' try: if mode == 0: pattern = re.compile(begREGEX + re.escape(candidate) + 's?' + endREGEX, re.I) if mode == 1: pattern = re.compile(begREGEX + re.escape(candidate) + endREGEX) if mode == 2: hyphen = True parts = candidate.split("-") pattern_string = begREGEX for part in parts: if len(part)<1 or part.find(" ", 0, 1)> -1: # This is not really a hyphen, maybe a minus sign: treat as isupper(). hyphen = False pattern_string = pattern_string + re.escape(part) + "[- \t]?" pattern_string += endREGEX if hyphen: pattern = re.compile(pattern_string, re.I) else: pattern = re.compile(begREGEX + re.escape(candidate) + endREGEX, re.I) if mode == 3: pattern = re.compile(begREGEX + candidate.replace("/","") + endREGEX, re.I) except: - print "Invalid thesaurus term: " + re.escape(candidate) + "
" + print "Invalid thesaurus term: " + re.escape(candidate) + "
" return pattern def profile(t="", d=""): import profile import pstats profile.run("generate_keywords_rdf(textfile='%s',dictfile='%s')" % (t, d), "bibclassify_profile") p = pstats.Stats("bibclassify_profile") p.strip_dirs().sort_stats("cumulative").print_stats() return 0 def main(): """Main function """ global options long_flags =["file=", "thesaurus=","ontology=", "output=","limit=", "nkeywords=", "mode=", "spires", "help", "version"] short_flags ="f:k:K:o:l:n:m:qhVv:" spires = False limit = 70 nkeywords = 25 input_file = "" dict_file = "" output = 0 mode = 0 verbose = 0 try: opts, args = getopt.getopt(sys.argv[1:], short_flags, long_flags) except getopt.GetoptError, err: write_message(err, sys.stderr) usage(1) if args: usage(1) try: from invenio.config import CFG_TMPDIR, CFG_PATH_PDFTOTEXT, CFG_VERSION version_bibclassify = 0.1 bibclassify_engine_version = "CDS Invenio/%s bibclassify/%s" % (CFG_VERSION, version_bibclassify) except: CFG_TMPDIR = TMPDIR_STANDALONE CFG_PATH_PDFTOTEXT = PDFTOTEXT_STANDALONE temp_text = CFG_TMPDIR + '/bibclassify.pdftotext.' 
+ str(os.getpid()) try: for opt in opts: if opt == ("-h","") or opt == ("--help",""): usage(1) elif opt == ("-V","") or opt == ("--version",""): print bibclassify_engine_version sys.exit(1) elif opt[0] in [ "-v", "--verbose" ]: verbose = opt[1] elif opt[0] in [ "-f", "--file" ]: if opt[1].find(".pdf")>-1: # Treat as PDF cmd = "%s " % CFG_PATH_PDFTOTEXT + opt[1] + " " + temp_text errcode = os.system(cmd) if errcode == 0 and os.path.exists("%s" % temp_text): input_file = temp_text else: print "Error while running %s.\n" % cmd sys.exit(1) else: # Treat as text input_file = opt[1] elif opt[0] in [ "-k", "--thesaurus" ]: if dict_file=="": dict_file = opt[1] else: print "Either a text thesaurus or an ontology (in .rdf format)" sys.exit(1) elif opt[0] in [ "-K", "--taxonomy" ]: if dict_file=="" and opt[1].find(".rdf")!=-1: dict_file = opt[1] else: print "Either a text thesaurus or an ontology (in .rdf format)" sys.exit(1) elif opt[0] in [ "-o", "--output" ]: try: if str(opt[1]).lower().strip() == "html": output = 1 elif str(opt[1]).lower().strip() == "text": output = 0 elif str(opt[1]).lower().strip() == "marcxml": output = 2 else: write_message('Output mode (-o) can only be "HTML", "TEXT", or "MARCXML". Using default output mode (HTML)') except: write_message('Output mode (-o) can only be "HTML", "TEXT", or "MARCXML". Using default output mode (HTML)') elif opt[0] in [ "-m", "--mode" ]: try: if str(opt[1]).lower().strip() == "partial": mode = 1 elif str(opt[1]).lower().strip() == "full": mode = 0 else: write_message('Processing mode (-m) can only be "PARTIAL" or "FULL". Using default output mode (FULL)') except: write_message('Processing mode (-m) can only be "PARTIAL" or "FULL". Using default output mode (FULL)') elif opt[0] in [ "-q", "--spires" ]: spires = True elif opt[0] in [ "-l", "--limit" ]: try: num = int(opt[1]) if num>1: limit = num else: write_message("Number of keywords for processing (--limit) must be an integer higher than 1. 
Using default value of 70...") except ValueError: write_message("Number of keywords for processing (-n) must be an integer. Using default value of 70...") elif opt[0] in [ "-n", "--nkeywords" ]: try: num = int(opt[1]) if num>1: nkeywords = num else: write_message("Number of keywords (--nkeywords) must be an integer higher than 1. Using default value of 25...") except ValueError: write_message("Number of keywords (--n) must be an integer. Using default value of 25...") except StandardError, e: write_message(e, sys.stderr) sys.exit(1) if input_file == "" or dict_file == "": write_message("Need to enter the name of an input file AND a thesaurus file \n") usage(1) # Weak method to detect dict_file. Need to improve this (e.g. by looking inside the metadata with rdflib?) if dict_file.find(".rdf")!=-1: outcome = generate_keywords_rdf(input_file, dict_file, output, limit, nkeywords, mode, spires, verbose, dict_file) else: # Treat as text outcome = generate_keywords(input_file, dict_file, verbose) print outcome if limit > len(outcome): limit = len(outcome) if output == 0: for i in range(limit): print outcome[i] else: print "" print "" print "Keywords" print "" print "" print '
' for i in range(limit): - print "" + str(outcome[i]) + "
" + print "" + str(outcome[i]) + "
" print '
' print "
" print "" return if __name__ == '__main__': main() diff --git a/modules/bibedit/web/admin/bibeditadmin.py b/modules/bibedit/web/admin/bibeditadmin.py index 05b5be884..7fea97eb7 100644 --- a/modules/bibedit/web/admin/bibeditadmin.py +++ b/modules/bibedit/web/admin/bibeditadmin.py @@ -1,136 +1,136 @@ ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """CDS Invenio BibEdit Administrator Interface.""" __revision__ = "$Id$" __lastupdated__ = """$Date$""" from invenio.config import cdslang, weburl from invenio.webpage import page from invenio.webuser import getUid, page_not_authorized from invenio.bibedit_engine import perform_request_index, perform_request_edit, perform_request_submit from invenio.search_engine import record_exists from invenio.access_control_engine import acc_authorize_action from invenio.messages import gettext_set_language, wash_language from invenio.urlutils import wash_url_argument, redirect_to_url -navtrail = """ Admin Area """ % (weburl,) +navtrail = """ Admin Area """ % (weburl,) def index(req, ln=cdslang, recid=None, temp="false", format_tag='marc', edit_tag=None, delete_tag=None, num_field=None, add=0, cancel=0, delete=0 ,confirm_delete=0, **args): """ BibEdit Admin interface. 
""" ln = wash_language(ln) _ = gettext_set_language(ln) uid = getUid(req) recid = wash_url_argument(recid, "int") add = wash_url_argument(add, "int") cancel = wash_url_argument(cancel, "int") delete = wash_url_argument(delete, "int") confirm_delete = wash_url_argument(confirm_delete, "int") (auth_code, auth_message) = acc_authorize_action(req,'runbibedit') if auth_code == 0: (body, errors, warnings) = perform_request_index(ln, recid, cancel, delete, confirm_delete, uid, temp, format_tag, edit_tag, delete_tag, num_field, add, args) else: return page_not_authorized(req=req, text=auth_message, navtrail=navtrail) if recid != 0: title = _("Record") + " #" + str(recid) if add == 3: title = _("Record %s - Add a field") % ('#' + str(recid)) else: title = _("BibEdit Admin Interface") return page(title = title, body = body, errors = errors, warnings = warnings, uid = getUid(req), language = ln, navtrail = navtrail, lastupdated = __lastupdated__, req = req) def edit(req, recid=None, tag=None, num_field='0', num_subfield=0, format_tag='marc', del_subfield=None, temp="false", add=0, ln=cdslang, **args): """ Edit Field page. 
""" ln = wash_language(ln) _ = gettext_set_language(ln) uid = getUid(req) recid = wash_url_argument(recid, "int") num_field = wash_url_argument(num_field, "int") add = wash_url_argument(add, "int") num_subfield = wash_url_argument(num_subfield, "int") (auth_code, auth_message) = acc_authorize_action(req,'runbibedit') if (auth_code == 0): if (recid and tag and (record_exists(recid)>0)): (body, errors, warnings) = perform_request_edit(ln, recid, uid, tag, num_field, num_subfield, format_tag, temp, del_subfield, add, args) else: redirect_to_url(req, 'index?ln=' + ln) else: return page_not_authorized(req=req, text=auth_message, navtrail=navtrail) title = _("Edit record %(x_recid)s, field %(x_field)s") % {'x_recid': '#' + str(recid), 'x_field': '#' + str(tag[:3])} if add == 1: title = _("Edit record %(x_recid)s, field %(x_field)s - Add a subfield") % {'x_recid': '#' + str(recid), 'x_field': '#' + str(tag[:3])} return page(title = title, body = body, errors = errors, warnings = warnings, uid = getUid(req), language = ln, navtrail = navtrail, lastupdated = __lastupdated__, req = req) def submit(req, recid='', ln=cdslang): """ Submit temp_record on database. 
""" ln = wash_language(ln) _ = gettext_set_language(ln) uid = getUid(req) recid = wash_url_argument(recid, "int") (auth_code, auth_message) = acc_authorize_action(req,'runbibedit') if auth_code == 0: if (recid and (record_exists(recid)>0)): (body, errors, warnings) = perform_request_submit(ln, recid) else: redirect_to_url(req, 'index?ln=' + ln) else: return page_not_authorized(req=req, text=auth_message, navtrail=navtrail) return page(title = _("Submit and save record %s") % ('#' + str(recid)), body = body, errors = errors, warnings = warnings, uid = getUid(req), language = ln, navtrail = navtrail, lastupdated = __lastupdated__, req = req) diff --git a/modules/bibformat/lib/bibformat_templates.py b/modules/bibformat/lib/bibformat_templates.py index 44e2de973..8f5aca49e 100644 --- a/modules/bibformat/lib/bibformat_templates.py +++ b/modules/bibformat/lib/bibformat_templates.py @@ -1,2301 +1,2301 @@ # -*- coding: utf-8 -*- ## ## $Id$ ## ## This file is part of CDS Invenio. ## Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008 CERN. ## ## CDS Invenio is free software; you can redistribute it and/or ## modify it under the terms of the GNU General Public License as ## published by the Free Software Foundation; either version 2 of the ## License, or (at your option) any later version. ## ## CDS Invenio is distributed in the hope that it will be useful, but ## WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ## General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with CDS Invenio; if not, write to the Free Software Foundation, Inc., ## 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
"""HTML Templates for BibFormat administration""" __revision__ = "$Id$" # non Invenio imports import cgi # Invenio imports from invenio.messages import gettext_set_language from invenio.config import weburl, sweburl from invenio.messages import language_list_long from invenio.config import CFG_PATH_PHP class Template: """Templating class, refer to bibformat.py for examples of call""" def tmpl_admin_index(self, ln, warnings, is_admin): """ Returns the main BibFormat admin page. @param ln language @param warnings a list of warnings to display at top of page. None if no warning @param is_admin indicate if user is authorized to use BibFormat @return main BibFormat admin page """ _ = gettext_set_language(ln) # load the right message language out = '' if warnings: out += '''
''' % {'warnings': '
'.join(warnings)} out += '''

This is where you can edit the formatting styles available for the records. ''' if not is_admin: out += '''You need to login to enter. ''' % {'weburl':weburl} out += '''

Manage Format Templates
Define how to format a record.
Manage Output Formats
Define which template is applied to which record for a given output.
Manage Knowledge Bases
Define mappings of values, for standardizing records or declaring often used values.

Format Elements Documentation
Documentation of the format elements to be used inside format templates.
BibFormat Admin Guide
Documentation about BibFormat administration
'''% {'weburl':weburl, 'ln':ln} if CFG_PATH_PHP: #Show PHP admin only if PHP is enabled out += '''

Old BibFormat admin interface (in gray box)

The BibFormat admin interface enables you to specify how the bibliographic data is presented to the end user in the search interface and search results pages. For example, you may specify that titles should be printed in bold font, the abstract in small italic, etc. Moreover, the BibFormat is not only a simple bibliographic data output formatter, but also an automated link constructor. For example, from the information on journal name and pages, it may automatically create links to publisher's site based on some configuration rules.

Configuring BibFormat

By default, a simple HTML format based on the most common fields (title, author, abstract, keywords, fulltext link, etc) is defined. You certainly want to define your own output formats in case you have a specific metadata structure.

Here is a short guide of what you can configure:

Define one or more output BibFormat behaviours. These are then passed as parameters to the BibFormat modules while executing formatting. -
Example: You can tell BibFormat that is has to enrich the +
Example: You can tell BibFormat that it has to enrich the incoming metadata file by the created format, or that it only has to print the format out.
Extraction Rules
Define how the metadata tags from input are mapped into internal BibFormat variable names. The variable names can afterwards be used in formatting and linking rules. -
Example: You can tell that 100 $a field +
Example: You can tell that 100 $a field should be mapped into $100.a internal variable that you could use later.
Link Rules
Define rules for automated creation of URI links from mapped internal variables. -
Example: You can tell a rule how to create a link to +
Example: You can tell a rule how to create a link to People database out of the $100.a internal variable representing author's name. (The $100.a variable was mapped in the previous step, see the Extraction Rules.)
File Formats
Define file format types based on file extensions. This will be used when proposing various fulltext services. -
Example: You can tell that *.pdf files will +
Example: You can tell that *.pdf files will be treated as PDF files.
User Defined Functions (UDFs)
Define your own functions that you can reuse when creating your own output formats. This enables you to do complex formatting without ever touching the BibFormat core code. -
Example: You can define a function how to match and +
Example: You can define a function how to match and extract email addresses out of a text file.
Define the output formats, i.e. how to create the output out of internal BibFormat variables that were extracted in a previous step. This is the functionality you would want to configure most of the time. It may reuse formats, user defined functions, knowledge bases, etc. -
Example: You can tell that authors should be printed in +
Example: You can tell that authors should be printed in italic, that if there are more than 10 authors only the first three should be printed, etc.
Knowledge Bases (KBs)
Define one or more knowledge bases that enables you to transform various forms of input data values into the unique standard form on the output. -
Example: You can tell that Phys Rev D and +
Example: You can tell that Phys Rev D and Physical Review D are both the same journal and that these names should be standardized to Phys Rev : D.
Execution Test
Enables you to test your formats on your sample data file. Useful when debugging newly created formats.

To learn more on BibFormat configuration, you can consult the BibFormat Admin Guide.

Running BibFormat

From the Web interface

Run Reformat Records tool. This tool permits you to update stored formats for bibliographic records. -
It should normally be used after configuring BibFormat's Behaviours and Formats. When these are ready, you can choose to rebuild formats for selected collections or you can manually enter a search query and the web interface will accomplish all necessary formatting steps. -
Example: You can request Photo collections to have their HTML brief formats rebuilt, or you can reformat all the records written by Ellis.

From the command-line interface

Consider having an XML MARC data file that is to be uploaded into the CDS Invenio. (For example, it might have been harvested from other sources and processed via BibConvert.) Having configured BibFormat and its default output type behaviour, you would then run this file through BibFormat as follows:

             $ bibformat < /tmp/sample.xml > /tmp/sample_with_fmt.xml
that would create default HTML formats and would "enrich" the input XML data file by this format. (You would then continue the upload procedure by calling successively BibUpload and BibIndex.)

Now consider a different situation. You would like to add a new possible format, say "HTML portfolio" and "HTML captions" in order to nicely format multiple photographs in one page. Let us suppose that these two formats are called hp and hc and are already loaded in the collection_format table. (TODO: describe how this is done via WebAdmin.) You would then proceed as follows: firstly, you would prepare the corresponding output behaviours called HP and HC (TODO: note the uppercase!) that would not enrich the input file but that would produce an XML file with only 001 and FMT tags. (This is in order not to update the bibliographic information but the formats only.) You would also prepare corresponding formats at the same time. Secondly, you would launch the formatting as follows:

             $ bibformat otype=HP,HC < /tmp/sample.xml > /tmp/sample_fmts_only.xml
that should give you an XML file containing only 001 and FMT tags. Finally, you would upload the formats:
             $ bibupload < /tmp/sample_fmts_only.xml
and that's it. The new formats should now appear in WebSearch.
''' % {'weburl':weburl, 'ln':ln} return out def tmpl_admin_format_template_show_attributes(self, ln, name, description, filename, editable, all_templates=[], new=False): """ Returns a page to change format template name and description If template is new, offer a way to create a duplicate from an existing template @param ln language @param name the name of the format @param description the description of the format @param filename the filename of the template @param editable True if we let user edit, else False @param all_templates a list of tuples (filename, name) of all other templates @param new if True, the format template has just been added (is new) @return editor for 'format' """ _ = gettext_set_language(ln) # load the right message language out = "" out += '''
0. %(close_editor)s  1. %(template_editor)s  2. %(modify_template_attributes)s  3. %(check_dependencies)s 

''' % {'ln':ln, 'menu':_("Menu"), 'filename':filename, 'close_editor': _("Close Editor"), 'modify_template_attributes': _("Modify Template Attributes"), 'template_editor': _("Template Editor"), 'check_dependencies': _("Check Dependencies") } disabled = "" readonly = "" if not editable: disabled = 'disabled="disabled"' readonly = 'readonly="readonly"' out += '''
''' % {'ln':ln, 'filename':filename} if new: #Offer the possibility to make a duplicate of existing format template code out += '''
Make a copy of format template: [?]
''' out += ''' ''' % {"name": name, 'ln':ln, 'filename':filename, 'disabled':disabled, 'readonly':readonly, 'name_label': _("Name"), 'weburl':weburl } out += '''
%(name)s attributes [?]
''' % {"description": description, 'ln':ln, 'filename':filename, 'disabled':disabled, 'readonly':readonly, 'description_label': _("Description"), 'update_format_attributes': _("Update Format Attributes"), 'weburl':weburl } return out def tmpl_admin_format_template_show_dependencies(self, ln, name, filename, output_formats, format_elements, tags): """ Shows the dependencies (on elements) of the given format. @param name the name of the template @param filename the filename of the template @param format_elements the elements (and list of tags in each element) this template depends on @param output_formats the output format that depend on this template @param tags the tags that are called by format elements this template depends on. """ _ = gettext_set_language(ln) # load the right message language out = '''
0. %(close_editor)s  1. %(template_editor)s  2. %(modify_template_attributes)s  3. %(check_dependencies)s 
Output Formats that use %(name)s Format Elements used by %(name)s* All Tags Called*
''' % {'ln':ln, 'filename':filename, 'menu': _("Menu"), 'close_editor': _("Close Editor"), 'modify_template_attributes': _("Modify Template Attributes"), 'template_editor': _("Template Editor"), 'check_dependencies': _("Check Dependencies"), 'name': name } #Print output formats if len(output_formats) == 0: out += '

No output format uses this format template.

' for output_format in output_formats: name = output_format['names']['generic'] filename = output_format['filename'] out += ''' %(name)s''' % {'filename':filename, 'name':name, 'ln':ln} if len(output_format['tags']) > 0: out += "("+", ".join(output_format['tags'])+")" out += "
" #Print format elements (and tags) out += '
' if len(format_elements) == 0: out += '

This format template uses no format element.

' for format_element in format_elements: name = format_element['name'] out += ''' %(name)s''' % {'name':"bfe_"+name.lower(), 'anchor':name.upper(), 'ln':ln} if len(format_element['tags']) > 0: out += "("+", ".join(format_element['tags'])+")" out += "
" #Print tags out += '
' if len(tags) == 0: out += '

This format template uses no tag.

' for tag in tags: out += '''%(tag)s
''' % { 'tag':tag} out += '''
*Note: Some tags linked with this format template might not be shown. Check manually. ''' return out def tmpl_admin_format_template_show(self, ln, name, description, code, filename, ln_for_preview, pattern_for_preview, editable, content_type_for_preview, content_types): """ Returns the editor for format templates. Edit 'format' @param ln language @param format the format to edit @param filename the filename of the template @param ln_for_preview the language for the preview (for bfo) @param pattern_for_preview the search pattern to be used for the preview (for bfo) @param editable True if we let user edit, else False @param code the code of the template of the editor @return editor for 'format' """ _ = gettext_set_language(ln) # load the right message language out = "" # If xsl, hide some options in the menu nb_menu_options = 4 if filename.endswith('.xsl'): nb_menu_options = 2 out += ''' ''' % {'ln': ln, 'filename': filename, 'menu': _("Menu"), 'label_show_doc': _("Show Documentation"), 'label_hide_doc': _("Hide Documentation"), 'close_editor': _("Close Editor"), 'modify_template_attributes': _("Modify Template Attributes"), 'template_editor': _("Template Editor"), 'check_dependencies': _("Check Dependencies"), 'nb_menu_options': nb_menu_options, 'weburl': sweburl or weburl } if not filename.endswith('.xsl'): out +=''' ''' % {'ln': ln, 'filename': filename, 'menu': _("Menu"), 'label_show_doc': _("Show Documentation"), 'label_hide_doc': _("Hide Documentation"), 'close_editor': _("Close Editor"), 'modify_template_attributes': _("Modify Template Attributes"), 'template_editor': _("Template Editor"), 'check_dependencies': _("Check Dependencies"), 'weburl': sweburl or weburl } out +='''
0. %(close_editor)s  1. %(template_editor)s 2. %(modify_template_attributes)s  3. %(check_dependencies)s 
''' % {'ln': ln, 'filename': filename, 'menu': _("Menu"), 'label_show_doc': _("Show Documentation"), 'label_hide_doc': _("Hide Documentation"), 'close_editor': _("Close Editor"), 'modify_template_attributes': _("Modify Template Attributes"), 'template_editor': _("Template Editor"), 'check_dependencies': _("Check Dependencies"), 'weburl': sweburl or weburl } disabled = "" readonly = "" toolbar = """""" % (weburl, ln) if not editable: disabled = 'disabled="disabled"' readonly = 'readonly="readonly"' toolbar = '' #First column: template code and preview out += ''' ''' % {'code':code, 'ln':ln, 'weburl':weburl, 'filename':filename, 'ln_for_preview':ln_for_preview, 'pattern_for_preview':pattern_for_preview } #Second column Print documentation out += '''
Format template code
Elements Documentation
''' % {'weburl':weburl, 'ln':ln} return out def tmpl_admin_format_template_show_short_doc(self, ln, format_elements): """ Prints the format element documentation in a condensed way to display inside format template editor. This page is different from others: it is displayed inside a