crf预处理更改

    技术2022-05-20  27

    //********************test******************************

    #include <ctype.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #include <iostream>
    #include <string>
    #include <vector>

    /*
     * Character-class tables. Each macro expands to one string literal holding a
     * space-separated list of symbols; the Is*() predicates look words up in them
     * with strstr().
     */
    #define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟")
    #define E_NUMBER ("1 2 3 4 5 6 7 8 9 0")
    #define E_ENGLISH ("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z")
    #define PUNCTION ("。 , 、 ; : ? ! “ ” ‘  ’╗ ╚ ┐ └ (  ) … … — — —  《  》  〈  〉 · .")
    /* The double quote has to be escaped, otherwise the literal ends early. */
    #define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

    /* NOTE(review): TEST_LINE_NUM is not referenced by any code visible here. */
    #define TEST_LINE_NUM 5000
    /* Size in bytes of the buffer used to read one input line. */
    #define MAXLINELEN 1024

    using namespace std;

    int get_test(const char *src, const char *dst);
    int get_test_open_tst(const char *src2, const char *dst2);
    void chomp(char *srcline);
    void from_seg_to_tag(const char *line, vector<string> &dst_line);
    void from_tag_to_data(vector<string> &v_dst, vector<string> &v_data);

    int main(int argc,char **argv){/* char src[] = "data.txt"; char dst[] = "test.txt";  char src2[] = "news.txt"; char dst2[] = "test2.txt";

     get_test(src,dst);*/ if (argc != 3) {  printf("usage: %s in_file out_file/n",argv[0]);  return(1); }  get_test_open_tst(argv[1],argv[2]);  //get_test_open_tst("1","2"); 

     return(1);}

    /*get test from training data for close test * */int get_test(const char *src,const char *dst){ FILE *fin,*fout,*fout2; char line[MAXLINELEN]; char first[64],second[64],third[64],four[64];  int num;

     fin = fopen(src,"rb"); if(NULL == fin) {  printf("can't open %s/n",src);  return(-1); } fout = fopen(dst,"wb");

     sprintf(first,"%s.tst",dst);  fout2 = fopen(first,"wb"); num = 0;  while(!feof(fin)) {  fgets(line,MAXLINELEN,fin);  if(line[0] == 0x0A || line[0] == 0x0D)  {   fprintf(fout,"%s",line);   fprintf(fout2,"%s",line);  }else  {    fprintf(fout2,"%s",line);   sscanf(line,"%s %s %s %s",first,second,third,four);   fprintf(fout,"%s %s %s/n",first,second,third);  }  num++;  if(num == MAXLINELEN)   break; } fclose(fin); fclose(fout); fclose(fout2); return(1);}

    /*
     * Cut `srcline` in place at its first CR (0x0D) or LF (0x0A).
     * A string without either byte is left unchanged.
     */
    void chomp(char *srcline)
    {
        char *cursor = srcline;
        while (*cursor != '\0' && *cursor != 0x0D && *cursor != 0x0A)
            ++cursor;
        *cursor = '\0';
    }

    /*
     * Return true when `str` contains exactly three runs of one or more spaces.
     * Leading and trailing runs are counted like any other run.
     */
    bool ValidColumn(const string &str)
    {
        size_t gaps = 0;
        bool in_gap = false;
        for (size_t i = 0; i < str.size(); ++i)
        {
            const bool is_space = (str[i] == ' ');
            if (is_space && !in_gap)
                ++gaps;
            in_gap = is_space;
        }
        return gaps == 3;
    }

    /* * 开放测试文本转换 * */int get_test_open_tst(const char *in_file,const char *out_file){ FILE *fin,*fout; char line[MAXLINELEN];

     fin = fopen(in_file,"rb"); if(NULL == fin) {  printf("can't open %s/n",in_file);  return(-1); } fout = fopen(out_file,"wb"); while(!feof(fin)) {  fgets(line,MAXLINELEN,fin);  chomp(line);      vector<string> v_dst,v_data;     from_seg_to_tag(line,v_dst);  from_tag_to_data(v_dst,v_data);

      for(size_t n = 0;n < v_data.size();n++)  {   if((n+1) < v_data.size())   {    if(v_data[n] == " " && v_data[n] == v_data[n+1])     continue;    else if(v_data[n] == "/n" && v_data[n] == v_data[n+1])     continue;    else     fprintf(fout,"%s/n",v_data[n].c_str());   }   else   {    fprintf(fout,"%s/n",v_data[n].c_str());   }  }  fprintf(fout,"/n"); } fclose(fin); fclose(fout); return(1);}

    bool IsEnglish(const string &word_cur){ if(strstr(E_ENGLISH,word_cur.c_str()))  return(true); if (isalpha(word_cur[0]))  return(true);  return(false);}bool IsNumber(const string &word_cur){ if(strstr(C_NUMBER,word_cur.c_str()))  return(true); 

     return(false);}bool IsEnumber(const string &word_cur){  /*English number*/   if(strstr(E_NUMBER,word_cur.c_str()))  return(true); if (isdigit(word_cur[0]))  return(true);  return(false);}

    bool IsPunc(const string &word_cur){ if(strstr(PUNCTION,word_cur.c_str()))  return(true); else if(strstr(E_PUNCTION,word_cur.c_str()))  return(true); else  return(false);}

    void split_word_tgt(string &word,string &tgt,const string &src){ size_t n;  word = ""; tgt = ""; for(n = 0;n < src.length() && src[n] != '/';n++) {  word += src[n]; } if(src[n] == '/') {  n++;  for(;n < src.length() ;n++)  {

       if(src[n] != ' ')    tgt += src[n];  } } else {  printf("error in split %s/n",src.c_str()); } return;}/* 给每个字附着属性信息,如标点、数字 */void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data){ size_t n,size;

     size = v_dst.size(); for(n = 0;n < size;n++) {  string word0,word2,word1,tgt,tmp = "";   if(v_dst[n] == " ")  {   v_data.push_back(" ");   continue;  }  word1 = v_dst[n];   tmp = word1;   tmp += " ";     //punctuation   if(IsPunc(word1) == true)   tmp += "y_punc";  else     tmp += "n_punc";  tmp += " ";       //number    if(IsNumber(word1) == true)   tmp += "C_num";//中文数字  else if (IsEnumber(word1) == true)   tmp += "A_num"; //阿拉伯数字  else if (IsEnglish(word1) == true)   tmp += "E_num";//英文  else   tmp += "N_num";//其他     v_data.push_back(tmp); } return;}

    /*
     * Read one unit from the start of `line` and append it to `array`.
     *
     *   - If the first byte is ASCII (1..127), the unit is the run of ASCII bytes
     *     up to the next space, non-ASCII byte or end of string. When the first
     *     byte is a space the run is empty and nothing is appended.
     *   - Otherwise the unit is the lead byte plus the byte that follows it.
     *     NOTE(review): this assumes a two-byte encoding such as GBK -- confirm
     *     before feeding UTF-8 text.
     *
     * Returns a pointer to the first byte after the unit. Empty input returns
     * `line` unchanged, and a lead byte with no following byte is emitted alone,
     * so the result never points past the terminating NUL.
     */
    const char *split_char_str(const char *line, vector<string> &array)
    {
        const char *pline = line;
        if (*pline == '\0')
            return pline;

        /* Compare as unsigned char so the test does not depend on char signedness. */
        if (static_cast<unsigned char>(*pline) < 0x80)
        {
            string tmp;
            /* Boundary between ASCII and double-byte text ends the run. */
            while (*pline != '\0' && *pline != ' ' &&
                   static_cast<unsigned char>(*pline) < 0x80)
            {
                tmp += *pline;
                pline++;
            }
            if (!tmp.empty())
                array.push_back(tmp);
        }
        else
        {
            string tmp(1, *pline);
            pline++;
            if (*pline != '\0')
            {
                tmp += *pline;
                pline++;
            }
            array.push_back(tmp);
        }
        return pline;
    }

    void from_seg_to_tag(const char *line,vector<string> &v_dst){ const char *pline = line; while(*pline) {  if(*pline != ' ')  {   vector<string> array;   pline = split_char_str(pline,array);   string dst_line ;      if(array.size() != 0)   {    dst_line = array[0];     v_dst.push_back(dst_line);    }  }  else  {   pline++;   } } return;}//***********************train*******************************

    #include <ctype.h>
    #include <stdio.h>
    #include <string.h>

    #include <iostream>
    #include <string>
    #include <vector>

    /* Size in bytes of the buffer used to read one input line. Parenthesised so
     * the macro expands safely inside larger expressions. */
    #define MAXLINELEN (1024 * 5)

    /*
     * Character-class tables. Each macro expands to one string literal holding a
     * space-separated list of symbols; the Is*() predicates look words up in them
     * with strstr().
     */
    #define C_NUMBER ("一 二 三 四 五 六 七 八 九 十 百 千 万 亿 壹 贰 叁 肆 伍 陆 柒 捌 玖 拾 佰 仟")
    #define E_NUMBER ("1 2 3 4 5 6 7 8 9 0")
    #define E_ENGLISH ("A B C D E F G H I J K L M N O P Q R S T U V W X Y Z a b c d e f g h i j k l m n o p q r s t u v w x y z")
    #define PUNCTION ("。 , 、 ; : ? ! “ ” ‘  ’╗ ╚ ┐ └ (  ) … … — — —  《  》  〈  〉 · .")
    /* The double quote has to be escaped, otherwise the literal ends early. */
    #define E_PUNCTION (". , ; : ? ! \" ' ( ) < >")

    using namespace std;

    int trans_file(const char *in_file, const char *out_file);
    int chomp(char *srcline);
    void from_seg_to_tag(const char *line, vector<string> &dst_line);
    void from_tag_to_data(vector<string> &v_dst, vector<string> &v_data);

    int main(int argc,char **argv){ if (argc != 3) {  printf("usage:%s in_file out_file/n", argv[0]);  return(1); }  trans_file(argv[1],argv[2]); return(1);}

    /*
     * Cut `srcline` in place at its first CR (0x0D) or LF (0x0A) and return the
     * length of what remains.
     */
    int chomp(char *srcline)
    {
        int len = 0;
        while (srcline[len] != '\0')
        {
            const char c = srcline[len];
            if (c == 0x0D || c == 0x0A)
                break;
            ++len;
        }
        srcline[len] = '\0';
        return len;
    }

    bool ValidColumn(const string &str){ size_t n,num;  num = 0; for(n = 0;n < str.size();) {  if(str[n] == ' ')  {   while(n < str.size() && str[n] == ' ')    n++;   num++;  }  else  {   n++;  } } if(num != 3)  return(false); else  return(true);}

    int trans_file(const char *in_file,const char *out_file){ FILE *fin,*fout; char line[MAXLINELEN];

     fin = fopen(in_file,"rb"); if(NULL == fin) {  printf("can't open %s/n",in_file);  return(-1); } fout = fopen(out_file,"wb"); while(!feof(fin)) {  fgets(line,MAXLINELEN,fin);  if(chomp(line) < 2)     continue;   vector<string> v_dst,v_data;     from_seg_to_tag(line,v_dst);  from_tag_to_data(v_dst,v_data);

      for(size_t n = 0;n < v_data.size();n++)  {   if(ValidColumn(v_data[n].c_str()) == true)    fprintf(fout,"%s/n",v_data[n].c_str());   else    printf("column size error =%s/n",v_data[n].c_str());  }  fprintf(fout,"/n"); } fclose(fin); fclose(fout); return(1);}bool IsEnglish(const string &word_cur){ if(strstr(E_ENGLISH,word_cur.c_str()))  return(true); if (isalpha(word_cur[0]))  return(true);  return(false);}bool IsNumber(const string &word_cur){ if(strstr(C_NUMBER,word_cur.c_str()))  return(true); 

     return(false);}bool IsEnumber(const string &word_cur){  /*English number*/ 

     if(strstr(E_NUMBER,word_cur.c_str()))  return(true); if (isdigit(word_cur[0]))  return(true);  return(false);}

    bool IsPunc(const string &word_cur){ if(strstr(PUNCTION,word_cur.c_str()))  return(true); else if(strstr(E_PUNCTION,word_cur.c_str()))  return(true); else  return(false);}void split_word_tgt(string &word,string &tgt,const string &src){ size_t n;  word = ""; tgt = ""; for(n = 0;n < src.length() && src[n] != '/';n++) {  word += src[n]; } if(src[n] == '/') {  n++;  for(;n < src.length() ;n++)  {   if(src[n] != ' ')    tgt += src[n];  } } else {  printf("error in split %s/n",src.c_str()); } return;}

    /*给字附着属性信息*/void from_tag_to_data(vector<string> &v_dst,vector<string> &v_data){ size_t n,size;

     size = v_dst.size(); for(n = 0;n < size;n++) {  string word0,word2,word1,tgt,tmp = "";    split_word_tgt(word1,tgt,v_dst[n]);      tmp = word1;   tmp += " ";     //punctuation   if(IsPunc(word1) == true)   tmp += "y_punc";  else     tmp += "n_punc";  tmp += " ";       //number   if(IsNumber(word1) == true)   tmp += "C_num";//中文数字  else if (IsEnumber(word1) == true)   tmp += "A_num"; //阿拉伯数字  else if (IsEnglish(word1) == true)   tmp += "E_num";//英文  else   tmp += "N_num";//其他     tmp += " ";     tmp += tgt;    v_data.push_back(tmp); } return;}

    /*
     * Split the space-delimited word at the start of `line` into units and
     * append them to `array`.
     *
     * Scanning stops at the first space or at the end of the string. A run of
     * ASCII bytes (1..127) forms one unit; any other byte forms one unit together
     * with the byte that follows it.
     * NOTE(review): this assumes a two-byte encoding such as GBK -- confirm
     * before feeding UTF-8 text.
     *
     * Returns a pointer to the space or terminating NUL where scanning stopped.
     * A lead byte with no following byte is emitted alone, so the result never
     * points past the terminating NUL.
     */
    const char *split_char_str(const char *line, vector<string> &array)
    {
        const char *pline = line;

        while (*pline != '\0' && *pline != ' ')
        {
            /* Compare as unsigned char so the test does not depend on char signedness. */
            if (static_cast<unsigned char>(*pline) < 0x80)
            {
                string tmp;
                while (*pline != '\0' && *pline != ' ' &&
                       static_cast<unsigned char>(*pline) < 0x80)
                {
                    tmp += *pline;
                    pline++;
                }
                if (!tmp.empty())
                    array.push_back(tmp);
            }
            else
            {
                string tmp(1, *pline);
                pline++;
                if (*pline != '\0')
                {
                    tmp += *pline;
                    pline++;
                }
                array.push_back(tmp);
            }
        }

        return pline;
    }

    void from_seg_to_tag(const char *line,vector<string> &v_dst){ const char *pline = line; while(*pline) {  if(*pline != ' ')  {   vector<string> array;   pline = split_char_str(pline,array);   if(array.size() == 0)   {   }   else if(array.size() == 1)   {    string dst_line ;        dst_line = array[0];     dst_line += "/S ";     v_dst.push_back(dst_line);    }   else if(array.size() == 2)   {    string dst_line ;        dst_line = array[0];     dst_line += "/B ";     v_dst.push_back(dst_line);     dst_line = array[1];     dst_line += "/E ";      v_dst.push_back(dst_line);    }   else if(array.size() == 3)   {    string dst_line ;        dst_line = array[0];     dst_line += "/B ";     v_dst.push_back(dst_line);     dst_line = array[1];     dst_line += "/B2 ";     v_dst.push_back(dst_line);     dst_line = array[2];     dst_line += "/E ";      v_dst.push_back(dst_line);    }   else if(array.size() >= 4)   {    string dst_line ;        dst_line = array[0];     dst_line += "/B ";     v_dst.push_back(dst_line);     dst_line = array[1];     dst_line += "/B2 ";     v_dst.push_back(dst_line);     dst_line = array[2];     dst_line += "/B3 ";     v_dst.push_back(dst_line);     for(size_t n = 3;n < (array.size()-1);n++)     {     dst_line = array[n];      dst_line += "/M ";      v_dst.push_back(dst_line);     }     dst_line = array[array.size()-1];     dst_line += "/E ";      v_dst.push_back(dst_line);    }  }  else  {   pline++;   } } return;}


    最新回复(0)