#By doing this like below, characters standing alone, like c a b is not added to the inedx, but when they are together with characters like c++ or c$ they are added.
for word in split(phrase):
if options["remove_stopword"] == "True" and not is_stopword(word, 1) and check_term(word, 0):
if lang and lang !="none" and options["use_stemming"]:
word = stem(word, lang)
if not words.has_key(word):
words[word] = (0,0)
words[word] = (words[word][0] + weight, 0)
elif options["remove_stopword"] == "True" and not is_stopword(word, 1):
write_message("Exception caught: %s" % e, sys.stderr)
if options["verbose"] >= 9:
traceback.print_tb(sys.exc_info()[2])
sys.exit(1)
wordTable.report_on_table_consistency()
# We are done. State it in the database, close and quit
return 1
def get_tags(config):
"""Get the tags that should be used creating the index and each tag's parameter"""
tags = []
function = config.get("rank_method","function")
i = 1
shown_error = 0
#try:
if 1:
while config.has_option(function,"tag%s"% i):
tag = config.get(function, "tag%s" % i)
tag = string.split(tag, ",")
tag[1] = int(string.strip(tag[1]))
tag[2] = string.strip(tag[2])
#check if stemmer for language is available
if config.get(function,"stemming") and stem("information", "en") != "inform":
if shown_error == 0:
write_message("Warning: PyStemmer not found. Please read INSTALL.")
shown_error = 1
elif tag[2] and tag[2] != "none" and config.get(function,"stemming") and not is_stemmer_available_for_language(tag[2]):
write_message("Warning: Language '%s' not available in PyStemmer." % tag[2])
tags.append(tag)
i += 1
#except Exception:
# write_message("Could not read data from configuration file, please check for errors")
# raise StandardError
return tags
def get_valid_range(rank_method_code):
"""Returns which records are valid for this rank method, according to which collections it is enabled for."""
#if options["verbose"] >=9:
# write_message("Getting records from collections enabled for rank method.")
#res = run_sql("SELECT collection.name FROM collection,collection_rnkMETHOD,rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % rank_method_code)
write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0]))
def update_rnkWORD(table, terms):
"""Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs.
table - name of forward index to update
terms - modified terms"""
stime = time.time()
Gi = {}
Nj = {}
N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0]
if len(terms) == 0 and options["quick"] == "yes":
write_message("No terms to process, ending...")
return ""
elif options["quick"] == "yes": #not used -R option, fast calculation (not accurate)
write_message("Beginning post-processing of %s terms" % len(terms))
#Locating all documents related to the modified/new/deleted terms, if fast update,
#only take into account new/modified occurences
write_message("Phase 1: Finding records containing modified terms")
terms = terms.keys()
i = 0
while i < len(terms):
terms_docs = get_from_forward_index(terms, i, (i+5000), table)
for (t, hitlist) in terms_docs:
term_docs = deserialize_via_marshal(hitlist)
if term_docs.has_key("Gi"):
del term_docs["Gi"]
for (j, tf) in term_docs.iteritems():
if (options["quick"] == "yes" and tf[1] == 0) or options["quick"] == "no":
Nj[j] = 0
write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
i += 5000
write_message("Phase 1: Finished finding records containing modified terms")
#Find all terms in the records found in last phase
write_message("Phase 2: Finding all terms in affected records")
records = Nj.keys()
i = 0
while i < len(records):
docs_terms = get_from_reverse_index(records, i, (i + 5000), table)
for (j, termlist) in docs_terms:
doc_terms = deserialize_via_marshal(termlist)
for (t, tf) in doc_terms.iteritems():
Gi[t] = 0
write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
i += 5000
write_message("Phase 2: Finished finding all terms in affected records")
else: #recalculate
max_id = run_sql("SELECT MAX(id) FROM %s" % table)
max_id = max_id[0][0]
write_message("Beginning recalculation of %s terms" % max_id)