crf预处理更改

技术2022-05-20 27

//********************test******************************

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <string>
#include <vector>

/*
 * Character classes used to derive the per-token feature columns.
 * Each macro is a string that the Is*() helpers search with strstr().
 */
/* Chinese numerals. */
#define C_NUMBER ("一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟")
/* Full-width digits. This used to be redefined further down as the ASCII
 * string "1 2 3 4 5 6 7 8 9 0", which silently replaced the full-width set;
 * ASCII digits are already recognised through isdigit() in IsEnumber(). */
#define E_NUMBER ("１２３４５６７８９０")
/* Full-width Latin letters. */
#define E_ENGLISH ("ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ")
/* Full-width / CJK punctuation. */
#define PUNCTION ("。，、；：？！ “ ” ‘ ’╗ ╚ ┐ └ （） … … — — — 《》〈〉 · .")
/* ASCII punctuation; the double quote has to be escaped inside the literal. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

/* Defined for the closed-test extraction; not referenced by the visible code. */
#define TEST_LINE_NUM 5000
/* Size in bytes of the buffer used to read one input line. */
#define MAXLINELEN 1024

using namespace std;

int get_test(const char *src,const char *dst);int get_test_open_tst(const char *src2,const char *dst2);void chomp(char *srcline);void from_seg_to_tag(const char *line,vector<string> &dst_line);void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);

/*
 * Entry point of the open-test converter.
 *
 * usage: <program> in_file out_file
 *
 * Returns 0 when the conversion ran, 1 on a usage error or when
 * get_test_open_tst() reports a failure (negative return value).
 */
int main(int argc, char **argv)
{
    /* Closed-test variant, kept for reference:
     *   char src[]  = "data.txt";  char dst[]  = "test.txt";
     *   char src2[] = "news.txt";  char dst2[] = "test2.txt";
     *   get_test(src, dst);
     */
    if (argc != 3)
    {
        printf("usage: %s in_file out_file\n", argv[0]);
        return 1;
    }

    /* A process exit status of 0 means success; report failure otherwise. */
    if (get_test_open_tst(argv[1], argv[2]) < 0)
        return 1;

    return 0;
}

/*
 * Build a closed-test set from training data.
 *
 * Reads `src` line by line and writes two files:
 *   - `dst`        : the first three whitespace-separated columns of every
 *                    non-blank line (blank lines are copied unchanged);
 *   - `dst`.tst    : an unmodified copy of every line that was read.
 * Stops after MAXLINELEN lines.
 *
 * Returns 1 on success, -1 when one of the files cannot be opened.
 */
int get_test(const char *src, const char *dst)
{
    FILE *fin = fopen(src, "rb");
    if (NULL == fin)
    {
        printf("can't open %s\n", src);
        return -1;
    }

    FILE *fout = fopen(dst, "wb");
    if (NULL == fout)
    {
        printf("can't open %s\n", dst);
        fclose(fin);
        return -1;
    }

    /* Built as a string so that a long `dst` cannot overflow a fixed buffer. */
    const string tst_name = string(dst) + ".tst";
    FILE *fout2 = fopen(tst_name.c_str(), "wb");
    if (NULL == fout2)
    {
        printf("can't open %s\n", tst_name.c_str());
        fclose(fin);
        fclose(fout);
        return -1;
    }

    char line[MAXLINELEN];
    char first[64], second[64], third[64];
    int num = 0;

    /* Loop on fgets() itself: testing feof() first would process the stale
     * buffer once more after the last successful read. */
    while (fgets(line, MAXLINELEN, fin) != NULL)
    {
        if (line[0] == 0x0A || line[0] == 0x0D)
        {
            /* Blank line: copied to both outputs unchanged. */
            fprintf(fout, "%s", line);
            fprintf(fout2, "%s", line);
        }
        else
        {
            fprintf(fout2, "%s", line);
            /* Field widths keep each column inside its 64-byte buffer. */
            if (sscanf(line, "%63s %63s %63s", first, second, third) == 3)
                fprintf(fout, "%s %s %s\n", first, second, third);
            else
                printf("skip malformed line: %s", line);
        }

        num++;
        /* NOTE(review): the line counter is compared against MAXLINELEN (a
         * buffer size); TEST_LINE_NUM may have been intended - confirm. */
        if (num == MAXLINELEN)
            break;
    }

    fclose(fin);
    fclose(fout);
    fclose(fout2);
    return 1;
}

/*
 * Cut `srcline` in place at its first CR (0x0D) or LF (0x0A) byte.
 * A string without either byte is left unchanged.
 */
void chomp(char *srcline)
{
    char *cursor = srcline;
    while (*cursor != '\0' && *cursor != 0x0D && *cursor != 0x0A)
        ++cursor;
    *cursor = '\0';
}

/*
 * Check the column layout of one output row.
 *
 * Counts the runs of consecutive space characters in `str` (several adjacent
 * spaces count as one run) and returns true when there are exactly three.
 */
bool ValidColumn(const string &str)
{
    size_t runs = 0;
    bool inside_run = false;

    for (size_t pos = 0; pos < str.size(); ++pos)
    {
        const bool is_space = (str[pos] == ' ');
        if (is_space && !inside_run)
            ++runs;          /* a new run of spaces starts here */
        inside_run = is_space;
    }

    return runs == 3;
}

/*
 * Open-test text conversion.
 *
 * Reads `in_file` line by line, splits every line into tokens with
 * from_seg_to_tag(), attaches the feature columns with from_tag_to_data()
 * and writes one row per token to `out_file`, followed by an empty line
 * after each input line.
 *
 * Returns 1 on success, -1 when a file cannot be opened.
 */
int get_test_open_tst(const char *in_file, const char *out_file)
{
    FILE *fin = fopen(in_file, "rb");
    if (NULL == fin)
    {
        printf("can't open %s\n", in_file);
        return -1;
    }

    FILE *fout = fopen(out_file, "wb");
    if (NULL == fout)
    {
        printf("can't open %s\n", out_file);
        fclose(fin);
        return -1;
    }

    char line[MAXLINELEN];

    /* Loop on fgets() itself: testing feof() first would convert the stale
     * buffer a second time after the last successful read. */
    while (fgets(line, MAXLINELEN, fin) != NULL)
    {
        chomp(line);

        vector<string> v_dst, v_data;
        from_seg_to_tag(line, v_dst);
        from_tag_to_data(v_dst, v_data);

        for (size_t n = 0; n < v_data.size(); n++)
        {
            /* Collapse a run of identical separator rows (" " or "\n")
             * into its last element. */
            const bool has_next = (n + 1) < v_data.size();
            const bool is_separator = (v_data[n] == " " || v_data[n] == "\n");
            if (has_next && is_separator && v_data[n] == v_data[n + 1])
                continue;

            fprintf(fout, "%s\n", v_data[n].c_str());
        }

        /* Empty line after the rows of each input line. */
        fprintf(fout, "\n");
    }

    fclose(fin);
    fclose(fout);
    return 1;
}

/*
 * True when `word_cur` occurs inside E_ENGLISH or starts with an ASCII
 * letter.
 *
 * strstr() is a byte-wise substring search, so an empty word also matches.
 */
bool IsEnglish(const string &word_cur)
{
    if (strstr(E_ENGLISH, word_cur.c_str()))
        return true;

    /* <ctype.h> functions require a value representable as unsigned char;
     * a raw char holding a high byte may be negative. */
    return isalpha(static_cast<unsigned char>(word_cur[0])) != 0;
}

/*
 * True when `word_cur` occurs inside C_NUMBER (byte-wise substring search).
 */
bool IsNumber(const string &word_cur)
{
    return strstr(C_NUMBER, word_cur.c_str()) != NULL;
}

/*
 * True when `word_cur` occurs inside E_NUMBER or starts with an ASCII digit.
 */
bool IsEnumber(const string &word_cur)
{
    if (strstr(E_NUMBER, word_cur.c_str()))
        return true;

    /* Same unsigned char requirement as in IsEnglish(). */
    return isdigit(static_cast<unsigned char>(word_cur[0])) != 0;
}

/*
 * True when `word_cur` occurs inside PUNCTION or inside E_PUNCTION
 * (byte-wise substring search with strstr()).
 */
bool IsPunc(const string &word_cur)
{
    const char *needle = word_cur.c_str();
    return strstr(PUNCTION, needle) != NULL
        || strstr(E_PUNCTION, needle) != NULL;
}

void split_word_tgt(string &word,string &tgt,const string &src){ size_t n; word = ""; tgt = ""; for(n = 0;n < src.length() && src[n] != '/';n++) { word += src[n]; } if(src[n] == '/') { n++; for(;n < src.length() ;n++) {

if(src[n] != ' ') tgt += src[n]; } } else { printf("error in split %s/n",src.c_str()); } return;}/* 给每个字附着属性信息，如标点、数字 */void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data){ size_t n,size;

size = v_dst.size(); for(n = 0;n < size;n++) { string word0,word2,word1,tgt,tmp = ""; if(v_dst[n] == "　") { v_data.push_back(" "); continue; } word1 = v_dst[n]; tmp = word1; tmp += " "; //punctuation if(IsPunc(word1) == true) tmp += "y_punc"; else tmp += "n_punc"; tmp += " "; //number if(IsNumber(word1) == true) tmp += "C_num";//中文数字 else if (IsEnumber(word1) == true) tmp += "A_num"; //阿拉伯数字 else if (IsEnglish(word1) == true) tmp += "E_num";//英文 else tmp += "N_num";//其他 v_data.push_back(tmp); } return;}

/*
 * Read one token from the start of `line` and append it to `array`.
 *
 *  - If the first byte is ASCII, the token is the run of ASCII bytes up to
 *    the next space, the next non-ASCII byte or the end of the string.
 *  - Otherwise the token is that byte plus the byte that follows it
 *    (presumably one double-byte GBK/GB2312 character - confirm encoding).
 *    When the string ends right after the first byte, only that byte is
 *    taken, so the cursor never moves past the terminator.
 *
 * Returns the position just after the token. On an empty string nothing is
 * appended and `line` is returned.
 */
const char *split_char_str(const char *line, vector<string> &array)
{
    const char *pline = line;

    /* Compare as unsigned char so the ASCII test does not depend on whether
     * plain char is signed on the target platform. */
    const unsigned char lead = static_cast<unsigned char>(*pline);
    if (lead == 0)
        return pline;

    if (lead < 0x80)
    {
        /* ASCII run; stops at the boundary between ASCII and non-ASCII text */
        string tmp;
        while (*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
        {
            tmp += *pline;
            pline++;
        }
        if (!tmp.empty())
            array.push_back(tmp);
        return pline;
    }

    /* Two-byte character; guard against a truncated trailing byte. */
    string tmp(1, *pline);
    pline++;
    if (*pline)
    {
        tmp += *pline;
        pline++;
    }
    array.push_back(tmp);
    return pline;
}

/*
 * Tokenise `line`: space characters are skipped, and for every other
 * position split_char_str() is asked for the next token, whose first
 * result (if any) is appended to `v_dst`.
 */
void from_seg_to_tag(const char *line, vector<string> &v_dst)
{
    const char *cursor = line;

    while (*cursor != '\0')
    {
        if (*cursor == ' ')
        {
            ++cursor;
            continue;
        }

        vector<string> pieces;
        cursor = split_char_str(cursor, pieces);
        if (!pieces.empty())
            v_dst.push_back(pieces[0]);
    }
}
//***********************train*******************************

#include <stdio.h>#include <ctype.h>#include <string.h>#include <iostream>#include <string>#include <vector>

/* Size in bytes of the buffer used to read one input line. Parenthesised so
 * the macro expands as a single value inside larger expressions. */
#define MAXLINELEN (1024*5)

/*
 * Character classes used to derive the per-token feature columns.
 * Each macro is a string that the Is*() helpers search with strstr().
 */
/* Chinese numerals. */
#define C_NUMBER ("一二三四五六七八九十百千万亿壹贰叁肆伍陆柒捌玖拾佰仟")
/* Full-width digits. */
#define E_NUMBER ("１２３４５６７８９０")
/* Full-width Latin letters. */
#define E_ENGLISH ("ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ")
/* Full-width / CJK punctuation. */
#define PUNCTION ("。，、；：？！ “ ” ‘ ’╗ ╚ ┐ └ （） … … — — — 《》〈〉 · .")
/* ASCII punctuation; the double quote has to be escaped inside the literal. */
#define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

using namespace std;

int trans_file(const char *in_file,const char *out_file);int chomp(char *srcline);void from_seg_to_tag(const char *line,vector<string> &dst_line);void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data);

/*
 * Entry point of the training-data converter.
 *
 * usage: <program> in_file out_file
 *
 * Returns 0 when the conversion ran, 1 on a usage error or when
 * trans_file() reports a failure (negative return value).
 */
int main(int argc, char **argv)
{
    if (argc != 3)
    {
        printf("usage:%s in_file out_file\n", argv[0]);
        return 1;
    }

    /* A process exit status of 0 means success; report failure otherwise. */
    if (trans_file(argv[1], argv[2]) < 0)
        return 1;

    return 0;
}

/*
 * Cut `srcline` in place at its first CR (0x0D) or LF (0x0A) byte and
 * return the length of the resulting string.
 */
int chomp(char *srcline)
{
    int length = 0;
    while (srcline[length] != '\0'
           && srcline[length] != 0x0D
           && srcline[length] != 0x0A)
    {
        ++length;
    }
    srcline[length] = '\0';
    return length;
}

/*
 * Convert a segmented corpus into training rows.
 *
 * Reads `in_file` line by line; lines shorter than two bytes after chomp()
 * are skipped. Every remaining line is tagged with from_seg_to_tag(), turned
 * into rows with from_tag_to_data(), and each row accepted by ValidColumn()
 * is written to `out_file`. Rejected rows are reported on stdout. An empty
 * line is written after the rows of each input line.
 *
 * Returns 1 on success, -1 when a file cannot be opened.
 */
int trans_file(const char *in_file, const char *out_file)
{
    FILE *fin = fopen(in_file, "rb");
    if (NULL == fin)
    {
        printf("can't open %s\n", in_file);
        return -1;
    }

    FILE *fout = fopen(out_file, "wb");
    if (NULL == fout)
    {
        printf("can't open %s\n", out_file);
        fclose(fin);
        return -1;
    }

    char line[MAXLINELEN];

    /* Loop on fgets() itself: testing feof() first would convert the stale
     * buffer a second time after the last successful read. */
    while (fgets(line, MAXLINELEN, fin) != NULL)
    {
        if (chomp(line) < 2)
            continue;

        vector<string> v_dst, v_data;
        from_seg_to_tag(line, v_dst);
        from_tag_to_data(v_dst, v_data);

        for (size_t n = 0; n < v_data.size(); n++)
        {
            if (ValidColumn(v_data[n].c_str()))
                fprintf(fout, "%s\n", v_data[n].c_str());
            else
                printf("column size error =%s\n", v_data[n].c_str());
        }

        /* Empty line after the rows of each input line. */
        fprintf(fout, "\n");
    }

    fclose(fin);
    fclose(fout);
    return 1;
}

/*
 * True when `word_cur` occurs inside E_ENGLISH or starts with an ASCII
 * letter.
 *
 * strstr() is a byte-wise substring search, so an empty word also matches.
 */
bool IsEnglish(const string &word_cur)
{
    if (strstr(E_ENGLISH, word_cur.c_str()))
        return true;

    /* <ctype.h> functions require a value representable as unsigned char;
     * a raw char holding a high byte may be negative. */
    return isalpha(static_cast<unsigned char>(word_cur[0])) != 0;
}

/*
 * True when `word_cur` occurs inside C_NUMBER (byte-wise substring search).
 */
bool IsNumber(const string &word_cur)
{
    return strstr(C_NUMBER, word_cur.c_str()) != NULL;
}

/*
 * True when `word_cur` occurs inside E_NUMBER or starts with an ASCII digit.
 */
bool IsEnumber(const string &word_cur)
{
    if (strstr(E_NUMBER, word_cur.c_str()))
        return true;

    /* Same unsigned char requirement as in IsEnglish(). */
    return isdigit(static_cast<unsigned char>(word_cur[0])) != 0;
}

/*
 * True when `word_cur` occurs inside PUNCTION or inside E_PUNCTION
 * (byte-wise substring search with strstr()).
 */
bool IsPunc(const string &word_cur)
{
    const char *needle = word_cur.c_str();
    return strstr(PUNCTION, needle) != NULL
        || strstr(E_PUNCTION, needle) != NULL;
}

/*
 * Split "word/tag" at the first '/'.
 *
 * `word` receives everything before the first '/', `tgt` everything after
 * it with space characters removed. When `src` contains no '/', `word`
 * receives the whole string, `tgt` stays empty and an error is printed.
 */
void split_word_tgt(string &word, string &tgt, const string &src)
{
    const size_t slash = src.find('/');

    word = src.substr(0, slash);   /* npos selects the whole string */
    tgt = "";

    if (slash == string::npos)
    {
        /* The message has to end with a real newline, not the text "/n". */
        printf("error in split %s\n", src.c_str());
        return;
    }

    for (size_t n = slash + 1; n < src.length(); n++)
    {
        if (src[n] != ' ')
            tgt += src[n];
    }
}

/*给字附着属性信息*/void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data){ size_t n,size;

size = v_dst.size(); for(n = 0;n < size;n++) { string word0,word2,word1,tgt,tmp = ""; split_word_tgt(word1,tgt,v_dst[n]); tmp = word1; tmp += " "; //punctuation if(IsPunc(word1) == true) tmp += "y_punc"; else tmp += "n_punc"; tmp += " "; //number if(IsNumber(word1) == true) tmp += "C_num";//中文数字 else if (IsEnumber(word1) == true) tmp += "A_num"; //阿拉伯数字 else if (IsEnglish(word1) == true) tmp += "E_num";//英文 else tmp += "N_num";//其他 tmp += " "; tmp += tgt; v_data.push_back(tmp); } return;}

/*
 * Split the space-delimited word at the start of `line` into units and
 * append them to `array`.
 *
 *  - A run of ASCII bytes becomes one unit.
 *  - Any other byte is taken together with the byte that follows it
 *    (presumably one double-byte GBK/GB2312 character - confirm encoding).
 *    When the string ends right after such a byte, only that byte is taken,
 *    so the cursor never moves past the terminator.
 *
 * Returns the position of the space or terminator that ended the word.
 */
const char *split_char_str(const char *line, vector<string> &array)
{
    const char *pline = line;

    while (*pline && *pline != ' ')
    {
        /* Compare as unsigned char so the ASCII test does not depend on
         * whether plain char is signed on the target platform. */
        if (static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            while (*pline && static_cast<unsigned char>(*pline) < 0x80 && *pline != ' ')
            {
                tmp += *pline;
                pline++;
            }
            if (!tmp.empty())
                array.push_back(tmp);
        }
        else
        {
            /* Two-byte character; guard against a truncated trailing byte. */
            string tmp(1, *pline);
            pline++;
            if (*pline)
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
    }

    return pline;
}

/*
 * Tag suffix for the unit at `index` inside a word made of `count` units:
 * a single unit is "/S "; otherwise the first is "/B ", the last "/E ",
 * the second "/B2 ", the third "/B3 " and every other one "/M ".
 */
static const char *position_tag(size_t index, size_t count)
{
    if (count == 1)
        return "/S ";
    if (index == 0)
        return "/B ";
    if (index == count - 1)
        return "/E ";
    if (index == 1)
        return "/B2 ";
    if (index == 2)
        return "/B3 ";
    return "/M ";
}

/*
 * Turn a segmented line into tagged units.
 *
 * Space characters are skipped; every other position is handed to
 * split_char_str(), and each unit it returns is appended to `v_dst` with
 * the position tag that matches its place inside the word.
 */
void from_seg_to_tag(const char *line, vector<string> &v_dst)
{
    const char *cursor = line;

    while (*cursor != '\0')
    {
        if (*cursor == ' ')
        {
            ++cursor;
            continue;
        }

        vector<string> units;
        cursor = split_char_str(cursor, units);

        const size_t count = units.size();
        for (size_t i = 0; i < count; ++i)
        {
            string tagged = units[i];
            tagged += position_tag(i, count);
            v_dst.push_back(tagged);
        }
    }
}

专利

最新回复(0)