Page MenuHomec4science

cpp_ngram.i
No OneTemporary

File Metadata

Created
Sun, Sep 1, 11:11

cpp_ngram.i

%include "std_wstring.i"
%include "std_string.i"
%include "std_vector.i"
%template(vectori) std::vector<int>;
%module cpp_ngram
// Declarations placed inside %{ ... %} are copied verbatim into the generated
// wrapper source, so the wrapper code can see the n-gram functions. They are
// declared `extern`: their definitions are not part of this interface file.
%{
extern std::vector<int> unicode_get_totals(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern std::vector<int> ascii_get_totals(std::string s1, std::string s2, unsigned int n = 3);
extern float unicode_compare(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern float ascii_compare(std::string s1, std::string s2, unsigned int n = 3);
%}
// The same four declarations again, this time outside %{ ... %} so that SWIG
// parses them and generates a wrapper for each one. In every function `n` is
// the n-gram size and defaults to 3. The *_get_totals functions return a
// std::vector<int>, exposed through the `vectori` template instantiated above;
// the *_compare functions return a float score.
extern std::vector<int> unicode_get_totals(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern std::vector<int> ascii_get_totals(std::string s1, std::string s2, unsigned int n = 3);
extern float unicode_compare(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern float ascii_compare(std::string s1, std::string s2, unsigned int n = 3);
%pythoncode %{
def compare(s1, s2, n = 3):
    """
    Returns a score between 0.0 and 1.0 comparing n-grams (n defaults to 3)
    from strings s1 and s2.

    Score is computed as follows:
    Let R be the number of ngrams in the "right" string, L be the number of
    ngrams in the "left" string and M be the number of common ngrams.
    Then we have
        score = M*2.0/(R+L)

    Example: let's compare "bambam" and "bambim" with trigrams.
        "bambam" => b, ba, 2 x bam, amb, mba, am, m
        "bambim" => b, ba, bam, amb, mbi, bim, im, m
    Here we have R = L = 8.
    For the matched ngrams, we have here: b, ba, bam, amb, m, that is M = 5.
    Then we have score = 2*5/(8+8) = 0.625

    Parameters:
        s1, s2 -- the strings to compare; either unicode strings or byte
                  strings (byte strings that are not pure ascii are decoded
                  as utf8).
        n      -- size of the n-grams, forwarded to the C++ implementation.

    Raises UnicodeDecodeError if a non-ascii byte string is not valid utf8.
    """
    # Shortcut: identical strings always score 1.0, no need to call into C++.
    if s1 == s2:
        return 1.0
    # See if we can do it ascii-fashion (faster). Only an encoding failure
    # means "not ascii"; any other error is a real problem and must surface.
    try:
        ascii1 = s1.encode("ascii")
        ascii2 = s2.encode("ascii")
    except UnicodeError:
        pass
    else:
        return _cpp_ngram.ascii_compare(ascii1, ascii2, n)
    # Ensure we have unicode strings (or python might crash).
    # NOTE(review): `unicode` is the Python 2 builtin; this path raises
    # NameError on Python 3 -- confirm the target interpreter.
    if not isinstance(s1, unicode):
        s1 = s1.decode("utf8")
    if not isinstance(s2, unicode):
        s2 = s2.decode("utf8")
    # Call cpp function
    return _cpp_ngram.unicode_compare(s1, s2, n)
def get_totals(s1, s2, n = 3):
    """
    Returns (left_ngrams, right_ngrams, common_ngrams)

    Parameters:
        s1, s2 -- the strings to analyse; either unicode strings or byte
                  strings (byte strings that are not pure ascii are decoded
                  as utf8).
        n      -- size of the n-grams (defaults to 3), forwarded to the C++
                  implementation.

    Raises UnicodeDecodeError if a non-ascii byte string is not valid utf8.
    """
    # See if we can do it ascii-fashion (faster). Only an encoding failure
    # means "not ascii"; any other error is a real problem and must surface.
    try:
        ascii1 = s1.encode("ascii")
        ascii2 = s2.encode("ascii")
    except UnicodeError:
        pass
    else:
        return _cpp_ngram.ascii_get_totals(ascii1, ascii2, n)
    # Ensure we have unicode strings (or python might crash).
    # NOTE(review): `unicode` is the Python 2 builtin; this path raises
    # NameError on Python 3 -- confirm the target interpreter.
    if not isinstance(s1, unicode):
        s1 = s1.decode("utf8")
    if not isinstance(s2, unicode):
        s2 = s2.decode("utf8")
    # Call cpp function
    return _cpp_ngram.unicode_get_totals(s1, s2, n)
%}

Event Timeline