libsvm-svm-scale.c 源码分析

    技术2022-05-19  22

     

    #include <float.h>

    #include <stdio.h>

    #include <stdlib.h>

    #include <ctype.h>

    #include <string.h>

     

    /*
     * Print the command-line usage summary to stdout and terminate the
     * process with exit status 1.  This function never returns.
     */
    void exit_with_help(void)
    {
        printf(
        "Usage: svm-scale [options] data_filename\n"
        "options:\n"
        "-l lower : x scaling lower limit (default -1)\n"
        "-u upper : x scaling upper limit (default +1)\n"
        "-y y_lower y_upper : y scaling limits (default: no y scaling)\n"
        "-s save_filename : save scaling parameters to save_filename\n"
        "-r restore_filename : restore scaling parameters from restore_filename\n"
        );
        exit(1);
    }

     

    /* Shared state of the scaler; set up by main() and read by the helpers. */
    char *line = NULL;  // buffer that readline() fills with the current input line
    int max_line_len = 1024;// current capacity of `line` in bytes; readline() doubles it when a line does not fit
    double lower=-1.0,upper=1.0,y_lower,y_upper;// target range for feature values (lower/upper) and for target values (y_lower/y_upper)
    int y_scaling = 0;// non-zero when target (y) scaling is enabled, via -y or a restore file that starts with 'y'
    double *feature_max;// per-index maximum feature value, max_index+1 entries (allocated in main)
    double *feature_min;// per-index minimum feature value, max_index+1 entries (allocated in main)
    double y_max = -DBL_MAX;// largest target value seen so far; only used when y_scaling is set
    double y_min = DBL_MAX;// smallest target value seen so far; only used when y_scaling is set
    int max_index;// largest feature index found in the data file (and in the restore file, if any)
    long int num_nonzeros = 0;// number of index:value entries counted in the input during pass 1
    long int new_num_nonzeros = 0;// number of index:value entries actually printed by output()

     

    /* Function-like max/min helpers.  Each argument may be evaluated twice,
       so do not pass expressions that have side effects. */
    #define max(x,y) (((x)>(y))?(x):(y))
    #define min(x,y) (((x)<(y))?(x):(y))

     

    /* Forward declarations of the helpers defined after main(). */
    void output_target(double value);// prints a sample's target value, rescaled first when y_scaling is set
    void output(int index, double value);// scales one attribute (index:value) and prints it; main() also calls it with value 0 for indices omitted from a sample
    char* readline(FILE *input);// reads one full line from `input` into the global `line` buffer; returns NULL at end of input

     

    int main(int argc,char **argv)

    {

    int i,index;

    FILE *fp, *fp_restore = NULL;

    char *save_filename = NULL;

    char *restore_filename = NULL;

     

    for(i=1;i<argc;i++)

    {

    if(argv[i][0] != '-') break;//用户没有指定-l, -u, -y, -s, -r的值

    ++i;

    switch(argv[i-1][1])

    {

    case 'l': lower = atof(argv[i]); break;

    case 'u': upper = atof(argv[i]); break;

    case 'y':

    y_lower = atof(argv[i]);

    ++i;

    y_upper = atof(argv[i]);

    y_scaling = 1;

    break;

    case 's': save_filename = argv[i]; break;

    case 'r': restore_filename = argv[i]; break;

    default:

    fprintf(stderr,"unknown option/n");

    exit_with_help();

    }

    }

     

    if(!(upper > lower) || (y_scaling && !(y_upper > y_lower)))

    {

    fprintf(stderr,"inconsistent lower/upper specification/n");

    exit(1);

    }

    if(restore_filename && save_filename)

    {

    fprintf(stderr,"cannot use -r and -s simultaneously/n");

    exit(1);

    }

     

    if(argc != i+1) 

    exit_with_help();

     

    fp=fopen(argv[i],"r");//读入待缩放文件

    if(fp==NULL)

    {

    fprintf(stderr,"can't open file %s/n", argv[i]);

    exit(1);

    }

     

    line = (char *) malloc(max_line_len*sizeof(char));//开辟1024的空间

     

    #define SKIP_TARGET/

    while(isspace(*p)) ++p;/

    while(!isspace(*p)) ++p;

     

    #define SKIP_ELEMENT/

    while(*p!=':') ++p;/

    ++p;/

    while(isspace(*p)) ++p;/

    while(*p && !isspace(*p)) ++p;

    /* assumption: min index of attributes is 1 */               //注意:这里默认最小index为1,不是0!!!

    /* pass 1: find out max index of attributes */

    max_index = 0;

     

    if(restore_filename)                   //如果指定-r参数,则从其后面指定的文件中找到最大的属性个数

    {

    int idx, c;

     

    fp_restore = fopen(restore_filename,"r");

    if(fp_restore==NULL)

    {

    fprintf(stderr,"can't open file %s/n", restore_filename);

    exit(1);

    }

     

    c = fgetc(fp_restore);

    if(c == 'y')

    {

    readline(fp_restore);

    readline(fp_restore);

    readline(fp_restore);

    }

    readline(fp_restore);

    readline(fp_restore);

     

    while(fscanf(fp_restore,"%d %*f %*f/n",&idx) == 1)

    max_index = max(idx,max_index);                 //和现有值比较

    rewind(fp_restore);

    }

     

    while(readline(fp)!=NULL) //待缩放文件中读入一个样本数据

    {

    char *p=line;

     

    SKIP_TARGET

     

    while(sscanf(p,"%d:%*f",&index)==1) //此行数据中不断读入index值(注意有可能出现跳跃现象,如: 1:value1 3:value3 )

    {

    max_index = max(max_index, index); //求出最大的index值,存于max_index,主要为了feature_max,feature_min开辟空间

    SKIP_ELEMENT

    num_nonzeros++;

    }

    }

    rewind(fp);//重新指向文件流的开头

    feature_max = (double *)malloc((max_index+1)* sizeof(double));

    feature_min = (double *)malloc((max_index+1)* sizeof(double));

    if(feature_max == NULL || feature_min == NULL)

    {

    fprintf(stderr,"can't allocate enough memory/n");

    exit(1);

    }

     

    for(i=0;i<=max_index;i++)

    {

    feature_max[i]=-DBL_MAX;

    feature_min[i]=DBL_MAX;

    }

     

    /* pass 2: find out min/max value */

    while(readline(fp)!=NULL)//读入一个样本数据

    {

    char *p=line;

    int next_index=1;

    double target;

    double value;

     

    sscanf(p,"%lf",&target);//y_scaling 有关??????

    y_max = max(y_max,target);

    y_min = min(y_min,target);

    SKIP_TARGET

     

    while(sscanf(p,"%d:%lf",&index,&value)==2) //从一个样本中读入每一个index和对应的value

    {

    for(i=next_index;i<index;i++)

    {

    feature_max[i]=max(feature_max[i],0); //这两句是为了出现index0:value0 index2:value2的时候,将中间省略的0项 

    feature_min[i]=min(feature_min[i],0);//index1:value1也考虑进内

    }

    feature_max[index]=max(feature_max[index],value);//取最大值

    feature_min[index]=min(feature_min[index],value);//取最小值

     

    SKIP_ELEMENT

    next_index=index+1;

    }

     

    for(i=next_index;i<=max_index;i++)  //对于读入此样本数据,可能总属性个数不能达到最大个数max_index,此处需要考虑此样本

    //后面省略的那些0值

    {

    feature_max[i]=max(feature_max[i],0);

    feature_min[i]=min(feature_min[i],0);

    }

    }

     

    rewind(fp);

     

    /* pass 2.5: save/restore feature_min/feature_max */

    if(restore_filename)

    {

    /* fp_restore rewinded in finding max_index */

    int idx, c;

    double fmin, fmax;

    if((c = fgetc(fp_restore)) == 'y')

    {

    fscanf(fp_restore, "%lf %lf/n", &y_lower, &y_upper);

    fscanf(fp_restore, "%lf %lf/n", &y_min, &y_max);

    y_scaling = 1;

    }

    else

    ungetc(c, fp_restore);

     

    if (fgetc(fp_restore) == 'x') {

    fscanf(fp_restore, "%lf %lf/n", &lower, &upper);

    while(fscanf(fp_restore,"%d %lf %lf/n",&idx,&fmin,&fmax)==3)//如果指定-r,则feature_max,feature_min的值以             //fp_restore中存的每一个列的最大最小值为准,前面从待缩放文件中求出的每列最大最小值就没用了。此时缩放之后的值有可能不在

    //[lower,upper]之内。很明显这是合理的,因为待测试的样本(即需要-r来缩放的,都是不可靠的数据,所以不能将他们的最值作为缩

    //放的标准,而用-s参数时,因为样本数据都是训练样本都是经过标记的,所以可以作为缩放的标准

    {

    if(idx<=max_index)

    {

    feature_min[idx] = fmin;

    feature_max[idx] = fmax;

    }

    }

    }

    fclose(fp_restore);

    }

    if(save_filename)//将feature_max,feature_min存成文件

    {

    FILE *fp_save = fopen(save_filename,"w");

    if(fp_save==NULL)

    {

    fprintf(stderr,"can't open file %s/n", save_filename);

    exit(1);

    }

    if(y_scaling)

    {

    fprintf(fp_save, "y/n");

    fprintf(fp_save, "%.16g %.16g/n", y_lower, y_upper);

    fprintf(fp_save, "%.16g %.16g/n", y_min, y_max);

    }

    fprintf(fp_save, "x/n");

    fprintf(fp_save, "%.16g %.16g/n", lower, upper);

    for(i=1;i<=max_index;i++)

    {

    if(feature_min[i]!=feature_max[i])

    fprintf(fp_save,"%d %.16g %.16g/n",i,feature_min[i],feature_max[i]);

    }

    fclose(fp_save);

    }

    /* pass 3: scale */

    while(readline(fp)!=NULL)//对样本数据进行缩放

    {

    char *p=line;

    int next_index=1;

    double target;

    double value;

    sscanf(p,"%lf",&target);

    output_target(target);

     

    SKIP_TARGET

     

    while(sscanf(p,"%d:%lf",&index,&value)==2)

    {

    for(i=next_index;i<index;i++)//对样本数据中那些空的属性进行以0值缩放

    output(i,0);

     

    output(index,value);//对非0值进行缩放

     

    SKIP_ELEMENT

    next_index=index+1;

    }

     

    for(i=next_index;i<=max_index;i++)//对剩下的空值也以0值进行缩放

    output(i,0);

     

    printf("/n");

    }

     

    if (new_num_nonzeros > num_nonzeros)

    fprintf(stderr, 

    "Warning: original #nonzeros %ld/n"

    "         new      #nonzeros %ld/n"

    "Use -l 0 if many original feature values are zeros/n",

    num_nonzeros, new_num_nonzeros);

     

    free(line);

    free(feature_max);

    free(feature_min);

    fclose(fp);

    return 0;

    }

     

    /*
     * Read one complete line from `input` into the global buffer `line`.
     *
     * The buffer holds max_line_len bytes.  While the text read so far has no
     * newline, the buffer is doubled and reading continues where it stopped,
     * so a line of any length ends up in `line` in one piece.
     *
     * Returns `line`, or NULL when nothing could be read (end of input or a
     * read error).  If the buffer cannot be grown, an error is printed and
     * the process exits with status 1.
     */
    char* readline(FILE *input)
    {
        int len;

        if(fgets(line,max_line_len,input) == NULL)
            return NULL;

        /* no newline yet: the line was longer than the buffer, or this is the
           last line of the file and it has no trailing newline */
        while(strrchr(line,'\n') == NULL)
        {
            char *grown;

            max_line_len *= 2;
            /* keep the old pointer until realloc is known to have succeeded */
            grown = (char *) realloc(line, max_line_len);
            if(grown == NULL)
            {
                fprintf(stderr,"can't allocate enough memory\n");
                exit(1);
            }
            line = grown;
            len = (int) strlen(line);
            /* append the next chunk right after what is already buffered */
            if(fgets(line+len,max_line_len-len,input) == NULL)
                break;
        }
        return line;
    }

     

    /*
     * Print a sample's target value followed by a single space.
     *
     * Without y scaling the value is printed unchanged.  With y scaling,
     * y_min maps to y_lower, y_max maps to y_upper, and any other value is
     * mapped linearly between them.
     */
    void output_target(double value)
    {
        double scaled;

        if(!y_scaling)
        {
            printf("%g ",value);
            return;
        }

        /* the two end points are mapped exactly, without arithmetic */
        if(value == y_min)
            scaled = y_lower;
        else if(value == y_max)
            scaled = y_upper;
        else
            scaled = y_lower + (y_upper-y_lower) *
                (value - y_min)/(y_max-y_min);

        printf("%g ",scaled);
    }

     

    /*
     * Scale one attribute value and print it as "index:value ".
     *
     * Nothing is printed when the attribute's recorded minimum and maximum
     * are equal, or when the scaled value is exactly 0.  Every printed entry
     * increments new_num_nonzeros.
     */
    void output(int index, double value)
    {
        const double lo = feature_min[index];
        const double hi = feature_max[index];
        double scaled;

        /* skip single-valued attribute */
        if(hi == lo)
            return;

        /* the two end points are mapped exactly, without arithmetic */
        if(value == lo)
            scaled = lower;
        else if(value == hi)
            scaled = upper;
        else
            scaled = lower + (upper-lower) *
                (value-lo)/
                (hi-lo);

        /* zeros are left out of the output */
        if(scaled == 0)
            return;

        printf("%d:%g ",index, scaled);
        new_num_nonzeros++;
    }

    注意:代码中没有指定缩放后的数据保存到哪里。缩放结果由 printf 写到标准输出,因此保存位置由命令行中的重定向 "> out.scale" 决定,即把原本输出到控制台的数据保存到指定的文件 out.scale 中;如果不加 "> out.scale",结果就直接输出到控制台。

     


    最新回复(0)