// split.cpp
//
// 自己寫的簡單分詞程序 (a simple hand-written word segmentation program)
// CPP
// 字號:
#include "Split.h"

// Creates a splitter with empty state: all counters are zero and the four
// working arrays are allocated with fixed sizes (100 token slots, 100 result
// slots, 200 English-piece slots and 100 zeroed run counters).
Split::Split()
{
	const int kSlots = 100;        // size of pStr, pResult and nEngStrNum
	const int kEnglishSlots = 200; // size of strEng

	nLength = 0;
	index = 0;
	nEngLength = 0;

	pStr = new string[kSlots];
	pResult = new string[kSlots];
	strEng = new string[kEnglishSlots];

	// The trailing "()" value-initialises the array, so every counter starts at 0.
	nEngStrNum = new int[kSlots]();
}

// Cuts a NUL-terminated byte string into tokens and appends them to pStr.
//
// A byte with its high bit set is taken as the first byte of a two-byte
// character and forms one token together with the byte that follows it
// (presumably a double-byte encoding such as GB2312 - TODO confirm with the
// caller). Any other byte starts a single-byte ("English") token that runs up
// to, but not including, the next space, high-bit byte or the terminator; a
// space therefore ends up as the first character of the token it starts.
//
// Side effects: nLength grows by one per token and is never reset here, so
// repeated calls keep appending. nEngStrNum[i] counts the English tokens of
// the i-th consecutive English run seen during this call.
//
// Source  text to split; a null pointer is treated as empty input.
// Returns the token array pStr (owned by the object).
//
// Input that would need more token slots than the array holds is truncated.
string* Split::SplitString(char *Source)
{
	// Number of slots in pStr / nEngStrNum; must match Split::Split().
	const int kMaxTokens = 100;

	if(!Source)
	{
		return pStr;
	}

	// Scratch C string for one character: up to two bytes plus the terminator.
	// Kept on the stack so nothing is leaked.
	char chTemp[3] = {0, 0, 0};

	int iPos = 0;
	int i = 0;          // index of the current English run in nEngStrNum
	bool bEng = false;  // true while the previous token was an English one

	while(Source[iPos] != 0 && nLength < kMaxTokens)
	{
		int j = iPos;

		chTemp[0] = Source[j];

		// Test the high bit through unsigned char so the result does not
		// depend on whether plain char is signed on this platform.
		if((static_cast<unsigned char>(Source[j]) & 0x80) != 0)
		{
			// Leaving an English run: later English tokens belong to the next run.
			if(bEng)
			{
				i++;
			}
			bEng = false;

			if(Source[j+1] != 0)
			{
				chTemp[1] = Source[j+1];
				j = j + 1;
			}
			else
			{
				// Lone lead byte at the very end of the input: keep it as a
				// one-byte token and do not step past the terminator.
				chTemp[1] = 0;
			}
			pStr[nLength] = chTemp;
		}
		else
		{
			nEngStrNum[i] = nEngStrNum[i] + 1; // count the English tokens of this run
			bEng = true;
			chTemp[1] = 0;
			pStr[nLength] = chTemp;

			// Extend the token over the following single-byte characters.
			while(true)
			{
				const char chNext = Source[j+1];
				if(chNext == ' ' || chNext == 0 ||
				   (static_cast<unsigned char>(chNext) & 0x80) != 0)
				{
					break;
				}
				j = j + 1;
				pStr[nLength] += chNext;
			}
		}

		j++;
		nLength++;
		iPos = j;
	}

	return pStr;
}

int Split::getLength()
{
	return this->nLength;
}

//正向最大匹配法
string* Split::Seg(vector<string> strArr)
{
	string strSource = "";
	bool   bIsEngString = false;
	int l = 0;
	
	for(int j = 0; j <= nLength;)
	{
		if(j == nLength)
		{
			if(strSource != " "&&strSource != "")
			{
				strSource = pStr[j];
				pResult[index] = strSource;
				cout<<strSource<<endl;
				break;
			}
			else
			{
				break;
			}
			
		}
		else
		{
			int i = j;
			
			//生成要進行匹配的字符串
			for(i; i < nLength; i++)
			{
				strSource = strSource + pStr[i];	
			}
			//////////////////////////////////////////////////////////////////////////
			
			while(1)
			{
				if(FindStrInDic(strSource,strArr) == false)
				{
					//如果只剩下一個字符，那么要判斷剩下的是兩個英文字符，還是一個中文字符
					if(strSource.length() == 2 && strSource[strSource.length()-1] < 0 )
					{
						if(strSource != " "&&strSource != "")
						{
							pResult[index] = strSource;
							cout<<strSource<<endl;
							index++;
							
						}
						break;
					}
					else if(strSource.length() == 1)
					{
						if(strSource != " "&&strSource != "")
						{
							pResult[index] = strSource;
							cout<<strSource<<endl;
							index++;
						
						}
						break;
						
					}
					//////////////////////////////////////////////////////////////////////////
					
					//判斷最后一個字符是英語字符還是中文字符,如果是strSource是一個英文字符串，則不再進行分詞,跳出此次循環
					if(strSource[strSource.length()-1] < 0)
					{
						strSource = strSource.substr(0,strSource.length()-2);
					}
					else
					{
						//判斷是否是一個英文字符串
						for(int n = 0; n < strSource.length(); n++)
						{
							if(strSource[n] < 0)
							{
								strSource = strSource.substr(0,strSource.length()-1);
								break;
							}
							if(n == strSource.length() - 1)
							{
								bIsEngString = true;
							}
						}
						//////////////////////////////////////////////////////////////////////////
						
						if(bIsEngString == true)
						{
							FindInterpunction(strSource);
							for(int k = 0; k < nEngLength; k++)
							{
								if(strSource != " "&&strSource != "")
								{
									pResult[index] = strEng[k];
									index++;
									cout<<strEng[k]<<endl;
									string str;
									strEng[k].erase();
								}
								
							}
							nEngLength = 0;
							break;
						}
						
					}
					/////////////////////////////////////////////////////////////////////////////
					
				}
				else
				{
					if(strSource != " "&&strSource != "")
					{
						pResult[index] = strSource;
						cout<<strSource<<endl;
						index++;
					}
					break;
				}
			
			}

			//判斷最后一個字符是英語字符還是中文字符,如果是一個英文字符串則把它看做一個詞
			if(bIsEngString == true)
			{
				j = j + nEngStrNum[l];
				l++;
				bIsEngString = false;
			}
			else if(strSource[strSource.length()-1] < 0)
			{
				j = j + strSource.length()/2;
			}
			else 
			{
				j = j + strSource.length();
			}
			//////////////////////////////////////////////////////////////////////////
			
			strSource = "";
		}	
	}
	
	return this->pResult;
}

// Linear dictionary lookup: returns true when some element of strArr compares
// equal to str, false otherwise.
bool Split::FindStrInDic(string str,vector<string> strArr)
{
	return find(strArr.begin(), strArr.end(), str) != strArr.end();
}

int Split::getIndex()
{
	return this->index;
}

// Separates the punctuation out of an English string.
//
// str is scanned from left to right and cut into pieces that are appended to
// strEng starting at strEng[nEngLength]; nEngLength is increased by the number
// of pieces added. Each maximal run of non-delimiter characters becomes one
// piece and each delimiter character (a space or one of the punctuation marks
// listed below) becomes a piece of its own, in order of appearance. A string
// with no non-delimiter character at all (including the empty string) is
// stored unchanged as a single piece.
//
// str     text to cut.
// Returns the piece array strEng (owned by the object).
//
// This replaces a strtok-based version that leaked a 200-byte buffer on every
// call, overflowed that buffer for longer input, and located delimiters by
// assuming exactly one delimiter between words - so "hello, world" recorded
// 'd' as a punctuation mark. Pieces beyond the capacity of strEng are dropped.
string* Split::FindInterpunction(string str)
{
	static const char Interpunction[] = ", . ? / < > ; : ' \" [ { ] } \\ | ! @ # $ % ^ & * ( )";

	// Number of slots in strEng; must match the allocation in Split::Split().
	const int kMaxPieces = 200;

	// Nothing but delimiters (or nothing at all): keep the string as one piece.
	if(str.find_first_not_of(Interpunction) == string::npos)
	{
		if(nEngLength < kMaxPieces)
		{
			strEng[nEngLength] = str;
			nEngLength++;
		}
		return strEng;
	}

	string::size_type pos = 0;
	while(pos < str.length() && nEngLength < kMaxPieces)
	{
		string::size_type next = str.find_first_of(Interpunction, pos);

		if(next == pos)
		{
			// A delimiter sits right here: it is a piece by itself.
			strEng[nEngLength] = string(1, str[pos]);
			pos = pos + 1;
		}
		else
		{
			// A word runs from pos up to the next delimiter or the end.
			if(next == string::npos)
			{
				next = str.length();
			}
			strEng[nEngLength] = str.substr(pos, next - pos);
			pos = next;
		}
		nEngLength++;
	}

	return strEng;
}
?? 文件大小 1424 K
?? 上傳用戶 multicolor
?? 所屬分類多國語言處理
??? 相關標簽

#分 #程序
?? 快捷鍵說明

復制代碼 Ctrl + C
搜索代碼 Ctrl + F
全屏模式 F11
切換主題 Ctrl + Shift + D
顯示快捷鍵 ?
增大字號 Ctrl + =
減小字號 Ctrl + -
亚洲欧美第一页_禁久久精品乱码_粉嫩av一区二区三区免费野_久草精品视频

?? split.cpp

?? 快捷鍵說明