// The decoder is the core module of a phrase-based statistical machine translation system.
// Source file: vocab.cpp
#include "Vocab.h"
// Next id handed out by senToIDs() to a word that is not in wordIndex.
// Starts at -1 and is decremented after every use, so these temporary ids
// are negative while vocabulary positions used by getWord() are positive.
int Vocab::ID = -1;
using namespace std;
/**
 * Creates a vocabulary whose word table initially holds one empty string
 * at position 0.
 */
Vocab::Vocab()
{
    // Position 0 is filled up front; the lookup methods only read indexWord
    // for ids greater than 0.
    indexWord.push_back(std::string());
}
bool Vocab::load(string vcb)
{
string strTmp;
string sentStart = "";
string sentEnd = "";
vcbFileName.open(vcb.c_str(), std::ios::in);
if (!vcbFileName)
{
cout return 0;
}
while (getline(vcbFileName, strTmp))
{
int index;
int firstTab, nextTab;
firstTab = strTmp.find_first_of('\t');
nextTab = strTmp.find_last_of('\t');
string id(strTmp, 0, firstTab);
string word(strTmp, firstTab + 1, nextTab - firstTab -1);
index = atoi(id.c_str());
indexWord.push_back(word);
wordIndex.insert(make_pair(word, index));
}
vcbFileName.clear();
vcbFileName.close();
int len = indexWord.size();
indexWord.push_back(sentStart);
wordIndex.insert(make_pair(sentStart, len));
indexWord.push_back(sentEnd);
wordIndex.insert(make_pair(sentEnd, len + 1));
return 1;
}
int Vocab::getIndex(string word)
{
WordIndex::iterator pos = wordIndex.find(word);
if (pos != wordIndex.end()) {
return pos->second;
}
return 1; //UNK
}
string Vocab::getWord(int index)
{
string str;
if ( (index > 0)) {//(index < MaxIndex) &&
str = indexWord[index];
}
else
{
str = unkTMP[index];
}
return str;
}
int Vocab::getIndices(string words, vector& indices)
{
vector vectmp;
split(words, vectmp);
int len = vectmp.size();
for(int i = 0; i < len; i++)
{
int fdsa = wordIndex.size();
WordIndex::iterator pos = wordIndex.find(vectmp[i]);
if (pos != wordIndex.end()) {
indices.push_back(pos->second);
}
else {
int idex = indexWord.size();
indexWord.push_back(vectmp[i]);
wordIndex.insert(make_pair(vectmp[i], idex));
indices.push_back(idex);
}
}
return len;
}
string Vocab::getWords(vector indices)
{
string str;
int len = indices.size();
for(int i = 0; i < len; i++)
{
int tm = indices[i];
if ((tm > 0)) {//(tm < MaxIndex) &&
str += indexWord[tm];
str += " ";
}
else
{
str += unkTMP[tm];
str += " ";
}
}
string::iterator pos = str.end() - 1;
str.erase(pos);
return str;
}
int Vocab::senToIDs(string sen, vector& senids)//将输入句子转化为ID
{
vector vectmp;
split(sen, vectmp);
int len = vectmp.size();
for(int i = 0; i < len; i++)
{
string str = vectmp[i];
WordIndex::iterator pos = wordIndex.find(vectmp[i]);
if (pos != wordIndex.end())
{
senids.push_back(wordIndex[vectmp[i]]);
}
else
{
int idfake = ID--;
senids.push_back(idfake);
unkTMP.insert(make_pair(idfake, vectmp[i]));
}
}
return len;
}
string Vocab::IDsTosen(deque senids)//将ID转换为string
{
string str;
int len = senids.size();
for(int i = 0; i < len; i++)
{
if ((senids[i] > 0))//(senids[i] < MaxIndex) &&
{
str += indexWord[senids[i]];
str += " ";
}
else
{
str += unkTMP[senids[i]];
str += " ";
}
}
string::iterator pos = str.end() - 1;
str.erase(pos);
return str;
}
void Vocab::mapClear()
{
unkTMP.clear();
}