From 1b8d0a16b7df53aa0f43f046fe8fcc4b446df69e Mon Sep 17 00:00:00 2001
From: George Marques
Date: Mon, 13 Jun 2016 14:06:03 -0300
Subject: Add similarity comparison to String

Uses the Sorensen-Dice coefficient to calculate similarity. This also adds
String.bigrams() as a convenience function needed by the comparison.
---
 core/ustring.cpp      | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 core/ustring.h        |  2 ++
 core/variant_call.cpp |  4 ++++
 3 files changed, 65 insertions(+)

(limited to 'core')

diff --git a/core/ustring.cpp b/core/ustring.cpp
index 485f7f1b62..ea9a9d903e 100644
--- a/core/ustring.cpp
+++ b/core/ustring.cpp
@@ -2810,6 +2810,65 @@ bool String::_base_is_subsequence_of(const String& p_string, bool case_insensiti
 	return false;
 }
 
+// Returns every pair of adjacent characters in this string, in order.
+// Strings shorter than two characters yield an empty Vector.
+Vector<String> String::bigrams() const {
+	int n_pairs = length() - 1;
+	Vector<String> b;
+	if(n_pairs <= 0) {
+		return b;
+	}
+	b.resize(n_pairs);
+	for(int i = 0; i < n_pairs; i++) {
+		b[i] = substr(i,2);
+	}
+	return b;
+}
+
+// Similarity according to Sorensen-Dice coefficient:
+// 2 * |shared bigrams| / (|bigrams of this| + |bigrams of p_string|).
+// Returns 1.0 for equal strings and 0.0 when either string has no bigram.
+// Each bigram of p_string is matched at most once, which keeps the result
+// within [0, 1] and independent of which string is the receiver.
+float String::similarity(const String& p_string) const {
+	if(operator==(p_string)) {
+		// Equal strings are totally similar
+		return 1.0f;
+	}
+	if (length() < 2 || p_string.length() < 2) {
+		// No way to calculate similarity without a single bigram
+		return 0.0f;
+	}
+
+	Vector<String> src_bigrams = bigrams();
+	Vector<String> tgt_bigrams = p_string.bigrams();
+
+	int src_size = src_bigrams.size();
+	int tgt_size = tgt_bigrams.size();
+
+	// Marks target bigrams already paired with a source bigram, so that
+	// repeated bigrams are not counted more often than they occur.
+	Vector<bool> tgt_matched;
+	tgt_matched.resize(tgt_size);
+	for (int j = 0; j < tgt_size; j++) {
+		tgt_matched[j] = false;
+	}
+
+	float sum = src_size + tgt_size;
+	float inter = 0;
+	for (int i = 0; i < src_size; i++) {
+		for (int j = 0; j < tgt_size; j++) {
+			if (!tgt_matched[j] && src_bigrams[i] == tgt_bigrams[j]) {
+				tgt_matched[j] = true;
+				inter++;
+				break;
+			}
+		}
+	}
+
+	return (2.0f * inter)/sum;
+}
+
 static bool _wildcard_match(const CharType* p_pattern, const CharType* p_string,bool p_case_sensitive) {
 	switch (*p_pattern) {
 	case '\0':
diff --git a/core/ustring.h b/core/ustring.h
index 8aceb0748c..692cb4e37d 100644
--- a/core/ustring.h
+++ b/core/ustring.h
@@ -123,6 +123,8 @@ public:
 	bool ends_with(const String& p_string) const;
 	bool is_subsequence_of(const String& p_string) const;
 	bool is_subsequence_ofi(const String& p_string) const;
+	Vector<String> bigrams() const;
+	float similarity(const String& p_string) const;
 	String replace_first(String p_key,String p_with) const;
 	String replace(String p_key,String p_with) const;
 	String replacen(String p_key,String p_with) const;
diff --git a/core/variant_call.cpp b/core/variant_call.cpp
index 0055138582..683b1611d8 100644
--- a/core/variant_call.cpp
+++ b/core/variant_call.cpp
@@ -249,6 +249,8 @@ static void _call_##m_type##_##m_method(Variant& r_ret,Variant& p_self,const Var
 	VCALL_LOCALMEM1R(String,ends_with);
 	VCALL_LOCALMEM1R(String,is_subsequence_of);
 	VCALL_LOCALMEM1R(String,is_subsequence_ofi);
+	VCALL_LOCALMEM0R(String,bigrams);
+	VCALL_LOCALMEM1R(String,similarity);
 	VCALL_LOCALMEM2R(String,replace);
 	VCALL_LOCALMEM2R(String,replacen);
 	VCALL_LOCALMEM2R(String,insert);
@@ -1274,6 +1276,8 @@ _VariantCall::addfunc(Variant::m_vtype,Variant::m_ret,_SCS(#m_method),VCALL(m_cl
 	ADDFUNC1(STRING,BOOL,String,ends_with,STRING,"text",varray());
 	ADDFUNC1(STRING,BOOL,String,is_subsequence_of,STRING,"text",varray());
 	ADDFUNC1(STRING,BOOL,String,is_subsequence_ofi,STRING,"text",varray());
+	ADDFUNC0(STRING,STRING_ARRAY,String,bigrams,varray());
+	ADDFUNC1(STRING,REAL,String,similarity,STRING,"text",varray());
 	ADDFUNC2(STRING,STRING,String,replace,STRING,"what",STRING,"forwhat",varray());
 	ADDFUNC2(STRING,STRING,String,replacen,STRING,"what",STRING,"forwhat",varray());
 
-- 
cgit v1.2.3