#By doing this like below, characters standing alone, like c a b is not added to the inedx, but when they are together with characters like c++ or c$ they are added.
for word in split(phrase):
if options["remove_stopword"] == "True" and not is_stopword_force(word) and check_term(word, 0):
if lang and lang !="none" and options["use_stemming"]:
word = stem_by_lang(word, lang)
if not words.has_key(word):
words[word] = (0,0)
words[word] = (words[word][0] + weight, 0)
elif options["remove_stopword"] == "True" and not is_stopword_force(word):
write_message("Exception caught: %s" % e, sys.stderr)
if options["verbose"] >= 9:
traceback.print_tb(sys.exc_info()[2])
sys.exit(1)
wordTable.report_on_table_consistency()
# We are done. State it in the database, close and quit
return 1
def get_tags(config):
"""Get the tags that should be used creating the index and each tag's parameter"""
tags = []
function = config.get("rank_method","function")
i = 1
shown_error = 0
#try:
if 1:
while config.has_option(function,"tag%s"% i):
tag = config.get(function, "tag%s" % i)
tag = string.split(tag, ",")
tag[1] = int(string.strip(tag[1]))
tag[2] = string.strip(tag[2])
#check if stemmer for language is available
if config.get(function,"stemming") and stem_by_lang("information", "en") != "inform":
if shown_error == 0:
write_message("Warning: PyStemmer not found. Please read INSTALL.")
shown_error = 1
elif tag[2] and tag[2] != "none" and config.get(function,"stemming") and not lang_available(tag[2]):
write_message("Warning: Language '%s' not available in PyStemmer." % tag[2])
tags.append(tag)
i += 1
#except Exception:
# write_message("Could not read data from configuration file, please check for errors")
# raise StandardError
return tags
def get_valid_range(rank_method_code):
"""Returns which records are valid for this rank method, according to which collections it is enabled for."""
#if options["verbose"] >=9:
# write_message("Getting records from collections enabled for rank method.")
#res = run_sql("SELECT collection.name FROM collection,collection_rnkMETHOD,rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % rank_method_code)
write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0]))
def update_rnkWORD(table, terms):
"""Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs.
table - name of forward index to update
terms - modified terms"""
stime = time.time()
Gi = {}
Nj = {}
N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0]
write_message("Beginning post-processing of %s terms" % len(terms))
if len(terms) == 0:
write_message("No terms to process, ending...")
return ""
#Locating all documents related to the modified/new/deleted terms, if fast update,
#only take into account new/modified occurences
write_message("Phase 1: Finding records containing modified terms")
terms = terms.keys()
i = 0
while i < len(terms):
terms_docs = get_from_forward_index(terms, i, (i+5000), table)
for (t, hitlist) in terms_docs:
term_docs = deserialize_via_marshal(hitlist)
if term_docs.has_key("Gi"):
del term_docs["Gi"]
for (j, tf) in term_docs.iteritems():
if (options["quick"] == "yes" and tf[1] == 0) or options["quick"] == "no":
Nj[j] = 0
write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
i += 5000
write_message("Phase 1: Finished finding records containing modified terms")
#Find all terms in the records found in last phase
write_message("Phase 2: Finding all terms in affected records")
records = Nj.keys()
i = 0
while i < len(records):
docs_terms = get_from_reverse_index(records, i, (i + 5000), table)
for (j, termlist) in docs_terms:
doc_terms = deserialize_via_marshal(termlist)
for (t, tf) in doc_terms.iteritems():
Gi[t] = 0
write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
i += 5000
write_message("Phase 2: Finished finding all terms in affected records")
terms = Gi.keys()
Gi = {}
i = 0
if options["quick"] == "no":
#Calculating Fi and Gi value for each term
write_message("Phase 3: Calculating importance of all affected terms")
while i < len(terms):
terms_docs = get_from_forward_index(terms, i, (i+5000), table)