Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F121687246
cpp_ngram.i
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Jul 13, 04:17
Size
2 KB
Mime Type
text/x-python
Expires
Tue, Jul 15, 04:17 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
27374352
Attached To
R3597 cpp_ngram
cpp_ngram.i
View Options
// SWIG interface for the cpp_ngram extension module: exposes the C++ n-gram
// comparison functions declared below.
// NOTE(review): %module is declared after the %include/%template lines; SWIG
// interface files conventionally start with %module - confirm this ordering
// is intentional.

// SWIG library files providing the std::wstring, std::string and std::vector
// support needed by the signatures below.
%include "std_wstring.i"
%include "std_string.i"
%include "std_vector.i"
// Instantiate std::vector<int> (the return type of the *_get_totals
// functions) for the target language under the name "vectori".
%template(vectori) std::vector<int>;
%module cpp_ngram
// Everything between %{ and %} is copied verbatim into the generated wrapper
// source so that it can see these declarations; the functions themselves are
// declared extern, i.e. defined in another translation unit.
%{
extern std::vector<int> unicode_get_totals(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern std::vector<int> ascii_get_totals(std::string s1, std::string s2, unsigned int n = 3);
extern float unicode_compare(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern float ascii_compare(std::string s1, std::string s2, unsigned int n = 3);
%}
// The same declarations again, this time read by SWIG itself to generate the
// wrappers. Each function takes two strings and an n-gram size n (default 3).
extern std::vector<int> unicode_get_totals(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern std::vector<int> ascii_get_totals(std::string s1, std::string s2, unsigned int n = 3);
extern float unicode_compare(std::wstring s1, std::wstring s2, unsigned int n = 3);
extern float ascii_compare(std::string s1, std::string s2, unsigned int n = 3);
%pythoncode %{
def compare(s1, s2, n = 3):
    """
    Returns a score between 0.0 and 1.0 comparing n-grams (n defaults to 3)
    from strings s1 and s2.

    Score is computed as follows:
    Let R be the number of ngrams in the "right" string, L be the number of
    ngrams in the "left" string and M be the number of common ngrams.
    Then we have
        score = M*2.0/(R+L)

    Example: let's compare "bambam" and "bambim" with trigrams.
        "bambam" => b, ba, 2 x bam, amb, mba, am, m
        "bambim" => b, ba, bam, amb, mbi, bim, im, m
    Here we have R = L = 8.
    For the matched ngrams, we have here: b, ba, bam, amb, m, that is M = 5.
    Then we have score = 5*2.0/(8+8) = 0.625

    s1 and s2 may be byte strings (assumed to be UTF-8 when they are not
    plain ASCII) or unicode strings. n is forwarded to the C++ implementation.
    """
    # Shortcut: identical inputs score 1.0 without calling into C++.
    if s1 == s2:
        return 1.0

    # See if we can do it ascii-fashion (faster). Only an encoding failure
    # sends us to the unicode path; errors raised by the C++ call itself are
    # no longer silently swallowed.
    try:
        ascii1 = s1.encode("ascii")
        ascii2 = s2.encode("ascii")
    except UnicodeError:
        pass
    else:
        return _cpp_ngram.ascii_compare(ascii1, ascii2, n)

    # Ensure we have unicode strings (or python might crash)
    if not isinstance(s1, unicode):
        s1 = s1.decode("utf8")
    if not isinstance(s2, unicode):
        s2 = s2.decode("utf8")

    # Call cpp function
    return _cpp_ngram.unicode_compare(s1, s2, n)
def get_totals(s1, s2, n = 3):
    """
    Returns (left_ngrams, right_ngrams, common_ngrams)

    The counts are computed by the C++ implementation for n-grams of size n
    (n defaults to 3). s1 and s2 may be byte strings (assumed to be UTF-8
    when they are not plain ASCII) or unicode strings.
    """
    # See if we can do it ascii-fashion (faster). Only an encoding failure
    # sends us to the unicode path; errors raised by the C++ call itself are
    # no longer silently swallowed.
    try:
        ascii1 = s1.encode("ascii")
        ascii2 = s2.encode("ascii")
    except UnicodeError:
        pass
    else:
        return _cpp_ngram.ascii_get_totals(ascii1, ascii2, n)

    # Ensure we have unicode strings (or python might crash)
    if not isinstance(s1, unicode):
        s1 = s1.decode("utf8")
    if not isinstance(s2, unicode):
        s2 = s2.decode("utf8")

    # Call cpp function
    return _cpp_ngram.unicode_get_totals(s1, s2, n)
%}
Event Timeline
Log In to Comment