# With the check below, characters standing alone (like "c", "a", "b") are not
# added to the index, but when they occur together with other characters (like
# "c++" or "c$") they are added.
# NOTE(review): this chunk has lost its indentation and its beginning -- the
# enclosing `def` and `try` lie before the visible text, so the original
# nesting cannot be reconstructed here.  Python 2 idioms throughout
# (has_key, StandardError); do not re-indent without the full file.
for word in split(phrase):
if not stopwords.has_key(word) and check_term(word, 0):
# Initialise a stemmer for the configured language, when PyStemmer knows it.
if lang and lang !="none" and stemmer.has_key(lang):
write_message("Stemmer for language '%s' initiated" % tag[2])
elif tag[2] != "none" and config.get(function,"stem_if_avail") == "yes" and not stemmer.has_key(tag[2]):
write_message("Warning: Language '%s' not available in PyStemmer." % tag[2])
# PyStemmer is an optional dependency: if the `stemmer` name was never bound
# (its import failed earlier), warn exactly once (shown_error flag) and
# continue without stemming.
except NameError:
if shown_error == 0:
write_message("Warning: PyStemmer not found. Please read INSTALL.")
shown_error = 1
tags.append(tag)
i += 1
# Any other failure while reading the configuration file is fatal.
except Exception:
write_message("Could not read data from configuration file, please check for errors")
raise StandardError
# Presumably the enclosing function returns the stopword dict plus the list of
# per-tag indexing configurations -- TODO confirm against the full file.
return (stopwords, tags)
def get_valid_range(rank_method_code):
"""Returns which records are valid for this rank method, according to which collections it is enabled for."""
#if options["verbose"] >=9:
# write_message("Getting records from collections enabled for rank method.")
#res = run_sql("SELECT collection.name FROM collection,collection_rnkMETHOD,rnkMETHOD WHERE collection.id=id_collection and id_rnkMETHOD=rnkMETHOD.id and rnkMETHOD.name='%s'" % rank_method_code)
# NOTE(review): `terms`, `Gi` and `i` are not defined anywhere in this
# function -- this chunk appears to have dropped the lines between the
# commented-out query above and the debug print below.  Do not trust this
# body as complete; recover the missing lines from the full file.
write_message("%s/%s---%s---%s" % (terms[i][0],terms[i][1], Gi[i][0],Gi[len(Gi) - i - 1][0]))
# NOTE(review): this chunk lost its indentation, and the function is cut off
# after the visible lines (Phase 3 continues past the end of the chunk), so
# only comments are added here.  Python 2 idioms throughout
# (has_key, iteritems, dict.keys() returning a list).
def update_rnkWORD(table, terms):
"""Updates rnkWORDF and rnkWORDR with Gi and Nj values. For each term in rnkWORDF, a Gi value for the term is added. And for each term in each document, the Nj value for that document is added. In rnkWORDR, the Gi value for each term in each document is added. For description on how things are computed, look in the hacking docs.
table - name of forward index to update
terms - modified terms"""
stime = time.time()
Gi = {}
Nj = {}
# Total record count: `table` names the forward (...F) index, so stripping the
# last character and appending "R" names the matching reverse (...R) table.
N = run_sql("select count(id_bibrec) from %sR" % table[:-1])[0][0]
if len(terms) == 0:
return ""
write_message("Beginning post-processing of %s terms" % len(terms))
#Locating all documents related to the modified/new/deleted terms, if fast update,
#only take into account new/modified occurences
write_message("Phase 1: Finding records containing modified terms")
terms = terms.keys()
i = 0
while i < len(terms):
current_terms = ""
# Batch 5000 terms per query; `(cond and a or b)` is the pre-Python-2.5
# ternary idiom, here computing min(i+5000, len(terms)).
for j in range(i, ((i+5000)< len(terms) and (i+5000) or len(terms))):
current_terms += "'%s'," % terms[j]
# NOTE(review): terms are interpolated straight into the SQL string -- safe
# only if terms are sanitised upstream; otherwise an SQL-injection risk.
terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term in (%s)" % (table,current_terms[:-1]))
for (t, hitlist) in terms_docs:
term_docs = deserialize_via_marshal(hitlist)
# Drop the stored "Gi" bookkeeping entry so only real record ids remain.
if term_docs.has_key("Gi"):
del term_docs["Gi"]
for (j, tf) in term_docs.iteritems():
# In "quick" mode only entries with tf[1] == 0 are collected --
# presumably tf[1] flags new/modified occurrences; TODO confirm
# against the code that serialises these hitlists.
if (options["quick"] == "yes" and tf[1] == 0) or options["quick"] == "no":
Nj[j] = 0
write_message("Phase 1: ......processed %s/%s terms" % ((i+5000>len(terms) and len(terms) or (i+5000)), len(terms)))
i += 5000
write_message("Phase 1: Finished finding records containing modified terms")
#Find all terms in the records found in last phase
write_message("Phase 2: Finding all terms in affected records")
records = Nj.keys()
i = 0
while i < len(records):
# str() of a list slice yields "[1, 2, ...]"; slicing off the first and last
# characters strips the brackets to form the SQL IN-list.
current_recs = "%s" % records[i:i+5000]
docs_terms = run_sql("SELECT id_bibrec, termlist FROM %sR WHERE id_bibrec in (%s)" % (table[:-1],current_recs[1:-1]))
for (j, termlist) in docs_terms:
doc_terms = deserialize_via_marshal(termlist)
# Collect every term that occurs in an affected record.
for (t, tf) in doc_terms.iteritems():
Gi[t] = 0
write_message("Phase 2: ......processed %s/%s records " % ((i+5000>len(records) and len(records) or (i+5000)), len(records)))
i += 5000
write_message("Phase 2: Finished finding all terms in affected records")
# Keep only the term names gathered above; Gi is reset so Phase 3 can refill
# it with computed importance values.
terms = Gi.keys()
Gi = {}
i = 0
if options["quick"] == "no":
#Calculating Fi and Gi value for each term
write_message("Phase 3: Calculating importance of all affected terms")
while i < len(terms):
current_terms = ""
for j in range(i, ((i+5000)< len(terms) and (i+5000) or len(terms))):
current_terms += "'%s'," % terms[j]
# (function body continues past the end of this chunk)
terms_docs = run_sql("SELECT term, hitlist FROM %s WHERE term in (%s)" % (table,current_terms[:-1]))