/////////////////////////////////////////////////////////////////////////////////
//
// Sentence matcher
// @author: Gonzales Cenelia
// homepage: www.ai-search.4t.com
//
// when applied to strings,the "Levenstein distance" is usualy used
// to measure the distances between words but we could also use it to measure
// the distance between sentences,the following code is a simple implementation
// of an algorithm that trys to measure the distances between two sentences
// by using the "Levenstein distance". //********************
//
// this code is copyrighted and has limited warranty. //***************
//////////////////////////////////////////////////////////////////////////
#include "matcher.h"
unsigned int tokenized_pattern_size;
unsigned int tokenized_text_size;
// tokenize a string and puts the result into a vector
void tokenize_string(const char *str, vstring &dest) {
int len = strlen(str);
char *szString = new char[len + 1];
strcpy(szString, str);
char *token = strtok(szString, seps);
while(token != NULL) {
dest.push_back(token);
token = strtok(NULL, seps);
}
delete szString;
}
// search for a string inside a vector of strings
int search_string(const char *str, vstring v) {
int size = v.size();
for(int i = 0; i < size; ++i) {
if(v[i] == str) {
return i;
}
}
return -1;
}
// compress sentences by replacing each word
// by characters of the english alphabet
void compress_strings(char *pattern, char *text) {
vstring v1, v2, temp;
tokenize_string(pattern, v1);
tokenize_string(text, v2);
std::stringstream s1, s2;
int size1 = v1.size();
int size2 = v2.size();
tokenized_pattern_size = size1;
tokenized_text_size = size2;
for(int i = 0, j = -1, prev_index; i < size1; ++i) {
if(search_string(v1[i].c_str(), temp) == -1) {
temp.push_back(v1[i].c_str());
++j; prev_index = j;
}
s1 }
int index = prev_index, counter = 0, result;
for(i = 0; i < size2; ++i) {
result = search_string(v2[i].c_str(), temp);
if(result == -1) {
temp.push_back(v2[i].c_str());
if(counter == 0) {
index = prev_index + 1;
++counter;
}
else {
++index;
}
}
else {
if(index > prev_index) {
prev_index = index;
counter = 0;
}
index = result;
}
s2 }
strcpy(pattern, s1.str().c_str());
strcpy(text, s2.str().c_str());
}
// measure the distance between two sentences by using
// the "Levenshtein Distance"
int find_distance(const char *pattern, const char *text) {
int patLen = strlen(pattern);
int textLen = strlen(text);
char *pszPattern = new char[patLen + 1];
char *pszText = new char[textLen + 1];
strcpy(pszPattern, pattern);
strcpy(pszText, text);
// compress the given sentences
compress_strings(pszPattern, pszText);
patLen = strlen(pszPattern);
textLen = strlen(pszText);
// measure the distances between the new compressed strings
int retval = LD(pszText, textLen, pszPattern, patLen);
delete pszPattern;
delete pszText;
return retval;
}
// Fuzzy Sentence Match
// function for matching two sentences
// returns a value between 0 and 1
// the closer that value is to 0 the closer the sentences are
float match(const char *pattern, const char *text) {
int distance = find_distance(pattern, text);
float max_size = max(tokenized_pattern_size, tokenized_text_size);
return distance/max_size;
}