解码器是基于短语的统计机器翻译系统的核心模块

源代码在线查看: vocab.cpp

软件大小: 83 K
上传用户: karon9999
关键词: 解码器 机器翻译系统 核心 模块
下载地址: 免注册下载 普通下载 VIP

相关代码

				#include "Vocab.h"
				
				// Definition of the static counter Vocab::ID, initialised to -1.
				// senToIDs() hands out its current value as the temporary id of each
				// out-of-vocabulary word and then decrements it, so temporary ids run
				// -1, -2, -3, ... and never collide with the positive vocabulary ids.
				int Vocab::ID = -1;
				
				using namespace std;
				
				// Constructs a vocabulary whose id-to-word table starts with one empty
				// placeholder string, so entries appended afterwards begin at the slot
				// that follows it.
				Vocab::Vocab()
				{
					const string placeholder;
					indexWord.push_back(placeholder);
				}
				
				// Loads a vocabulary file. Every useful line has the form
				//     id TAB word            or
				//     id TAB word TAB rest
				// and the word is registered under the id written in the file.
				//
				// vcb: path of the vocabulary file.
				// Returns false when the file cannot be opened, true otherwise.
				//
				// Lines without a tab, or whose id does not parse to a positive number,
				// are skipped. After the file has been read, the sentence-start and
				// sentence-end markers are appended behind the highest slot in use.
				bool Vocab::load(string vcb)
				{
					// NOTE(review): in the reviewed copy of this file both marker literals
					// were empty and the statement after "cout" was cut off, which looks
					// like "<...>" sequences removed by an HTML extractor. "<s>" / "</s>"
					// and the message text below are reconstructions -- confirm them
					// against the original source.
					const string sentStart = "<s>";
					const string sentEnd = "</s>";

					vcbFileName.open(vcb.c_str(), std::ios::in);
					if (!vcbFileName)
					{
						cout << "Cannot open vocabulary file: " << vcb << endl;
						return false;
					}

					string line;
					while (getline(vcbFileName, line))
					{
						const string::size_type firstTab = line.find('\t');
						if (firstTab == string::npos)
						{
							continue;	// blank or malformed line: no id/word separator
						}

						// The word stops at the last tab when another column follows it;
						// with a single tab it runs to the end of the line.
						const string::size_type lastTab = line.rfind('\t');
						const string::size_type wordLen =
							(lastTab > firstTab) ? (lastTab - firstTab - 1) : string::npos;
						const string word(line, firstTab + 1, wordLen);

						const int index = atoi(line.substr(0, firstTab).c_str());
						if (index <= 0)
						{
							// Slot 0 is the constructor's placeholder and negative ids are
							// handed out by senToIDs(), so neither may come from the file.
							continue;
						}

						// Store the word at the slot named by its id (not simply at the
						// end), so indexWord[id] is right even when ids have gaps or do
						// not start at 1.
						const string::size_type slot = static_cast<string::size_type>(index);
						if (slot >= indexWord.size())
						{
							indexWord.resize(slot + 1);
						}
						indexWord[slot] = word;
						wordIndex.insert(make_pair(word, index));
					}
					// Reset the eof/fail flags left by the read loop before closing.
					vcbFileName.clear();
					vcbFileName.close();

					const int startId = static_cast<int>(indexWord.size());
					indexWord.push_back(sentStart);
					wordIndex.insert(make_pair(sentStart, startId));
					indexWord.push_back(sentEnd);
					wordIndex.insert(make_pair(sentEnd, startId + 1));
					return true;
				}
				
				// Returns the id stored for `word`, or 1 (the id this class uses for
				// UNK) when the word is not in wordIndex. The vocabulary is not changed.
				int Vocab::getIndex(string word)
				{
					const int unkId = 1; //UNK
					WordIndex::iterator hit = wordIndex.find(word);
					return (hit == wordIndex.end()) ? unkId : hit->second;
				}
				
				// Returns the word that belongs to `index`.
				//
				// Positive ids are looked up in indexWord; zero and negative ids are
				// looked up in unkTMP, the table senToIDs() fills with temporary ids.
				// An id that is not present in the relevant table yields an empty
				// string. The lookup never reads past the end of indexWord and never
				// adds an entry to unkTMP.
				string Vocab::getWord(int index)
				{
					if (index > 0)
					{
						if (static_cast<string::size_type>(index) < indexWord.size())
						{
							return indexWord[index];
						}
						return string();
					}

					// find() first: operator[] alone would insert an empty entry for
					// every id that is not in the table.
					if (unkTMP.find(index) != unkTMP.end())
					{
						return unkTMP[index];
					}
					return string();
				}
				
				int Vocab::getIndices(string words, vector& indices)
				{
					vector vectmp;
					split(words, vectmp);
					int len = vectmp.size();
					for(int i = 0; i < len; i++)
					{
						int fdsa = wordIndex.size();
						WordIndex::iterator pos = wordIndex.find(vectmp[i]);
						if (pos != wordIndex.end()) {
							indices.push_back(pos->second);
						}
						else {
							int idex = indexWord.size();
							indexWord.push_back(vectmp[i]);
							wordIndex.insert(make_pair(vectmp[i], idex));
							indices.push_back(idex);
						}
					}
					return len;
				}
				
				// Converts a list of ids into the corresponding words joined by single
				// spaces (no trailing space).
				//
				// Positive ids are read from indexWord, zero and negative ids from
				// unkTMP. An id missing from its table contributes an empty word, so the
				// separators around it are still emitted. An empty list yields an empty
				// string.
				//
				// NOTE(review): the element type <int> was missing in the reviewed copy
				// of this file and is restored from usage -- confirm against Vocab.h.
				string Vocab::getWords(vector<int> indices)
				{
					string str;
					const string::size_type count = indices.size();
					for (string::size_type i = 0; i < count; i++)
					{
						if (i > 0)
						{
							str += " ";
						}

						const int id = indices[i];
						if (id > 0)
						{
							// Guard against ids beyond the end of the table.
							if (static_cast<string::size_type>(id) < indexWord.size())
							{
								str += indexWord[id];
							}
						}
						else if (unkTMP.find(id) != unkTMP.end())
						{
							// Only read entries that exist; operator[] alone would insert.
							str += unkTMP[id];
						}
					}
					return str;
				}
				
				int Vocab::senToIDs(string sen, vector& senids)//将输入句子转化为ID
				{
					vector vectmp;
					split(sen, vectmp);
					int len = vectmp.size();
					for(int i = 0; i < len; i++)
					{
						string str = vectmp[i];
						WordIndex::iterator pos = wordIndex.find(vectmp[i]);
						if (pos != wordIndex.end()) 
						{
							senids.push_back(wordIndex[vectmp[i]]);
						}
						else
						{
							int idfake = ID--;
							senids.push_back(idfake);
							unkTMP.insert(make_pair(idfake, vectmp[i]));
						}
					}
					return len;
				}
				
				// Converts a sequence of ids into a string of words joined by single
				// spaces (no trailing space).
				//
				// Positive ids are read from indexWord, zero and negative ids from
				// unkTMP. An id missing from its table contributes an empty word, so the
				// separators around it are still emitted. An empty sequence yields an
				// empty string.
				//
				// NOTE(review): the element type <int> was missing in the reviewed copy
				// of this file and is restored from usage -- confirm against Vocab.h.
				string Vocab::IDsTosen(deque<int> senids)
				{
					string str;
					const string::size_type count = senids.size();
					for (string::size_type i = 0; i < count; i++)
					{
						if (i > 0)
						{
							str += " ";
						}

						const int id = senids[i];
						if (id > 0)
						{
							// Guard against ids beyond the end of the table.
							if (static_cast<string::size_type>(id) < indexWord.size())
							{
								str += indexWord[id];
							}
						}
						else if (unkTMP.find(id) != unkTMP.end())
						{
							// Only read entries that exist; operator[] alone would insert.
							str += unkTMP[id];
						}
					}
					return str;
				}
				
				void Vocab::mapClear()
				{
					unkTMP.clear();
				}
				
				
				
				
				
				
				
							

相关资源