关联block

    技术2024-07-24  63

    1.5.3 关联block_device结构

    接下来是register_disk函数,来自fs/partitions/check.c

     

        473 /* Not exported, helper to add_disk(). */

        474 void register_disk(struct gendisk *disk)

        475 {

        476         struct block_device *bdev;

        477         char *s;

        478         int i;

        479         struct hd_struct *p;

        480         int err;

        481

        482         strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN);

        483         /* ewww... some of these buggers have / in name... */

        484         s = strchr(disk->kobj.name, '/');

        485         if (s)

        486                 *s = '!';

        487         if ((err = kobject_add(&disk->kobj)))

        488                 return;

        489         err = disk_sysfs_symlinks(disk);

        490         if (err) {

        491                 kobject_del(&disk->kobj);

        492                 return;

        493         }

        494         disk_sysfs_add_subdirs(disk);

        495

        496         /* No minors to use for partitions */

        497         if (disk->minors == 1)

        498                 goto exit;

        499

        500         /* No such device (e.g., media were just removed) */

        501         if (!get_capacity(disk))

        502                 goto exit;

        503

        504         bdev = bdget_disk(disk, 0);

        505         if (!bdev)

        506                 goto exit;

        507

        508         /* scan partition table, but suppress uevents */

        509         bdev->bd_invalidated = 1;

        510         disk->part_uevent_suppress = 1;

        511         err = blkdev_get(bdev, FMODE_READ, 0);

        512         disk->part_uevent_suppress = 0;

        513         if (err < 0)

        514                 goto exit;

        515         blkdev_put(bdev);

        516

        517 exit:

        518         /* announce disk after possible partitions are already created */

        519         kobject_uevent(&disk->kobj, KOBJ_ADD);

        520

        521         /* announce possible partitions */

        522         for (i = 1; i < disk->minors; i++) {

        523                 p = disk->part[i-1];

        524                 if (!p || !p->nr_sects)

        525                         continue;

        526                 kobject_uevent(&p->kobj, KOBJ_ADD);

        527         }

        528 }

     

    首先487行这个kobject_add的作用是很直观的,在Sysfs中为这块磁盘建一个子目录。例如我们为一块硬盘建立一个块设备驱动,则会在/sys/block/目录中看到一个名为sdf的子目录;要是把调用kobject_add函数的这一行注释掉,肯定就看不到这个sdf目录了。这里有两个问题:

     

    第一,为什么kobject_add这么一调用,生成的这个子目录的名字就叫做“sdf”,而不叫做别的呢?其实在sd_probe中做过这么一件事情:通过精心计算得到disk_name,而这个disk_name正是struct gendisk的一个成员。这里我们看到,482行把disk_name给了kobj.name,这就是为什么我们调用kobject_add添加一个kobject的时候,它的名字就是我们当时的disk_name。

     

    第二,为什么生成的这个子目录是在/sys/block目录下面,而不是在别的位置呢?还记得在alloc_disk_node中我们申请struct gendisk的情景么?kobj_set_kset_s(disk,block_subsys)做的就是让disk对应的kobject从属于block_subsys对应的kobject下面。这就是为什么我们现在添加这个kobject的时候,它很自然地就会在/sys/block子目录下面建立文件。

     

    继续走,disk_sysfs_symlinks来自fs/partitions/check.c,这个函数虽然不短,但是比较浅显易懂。

     

    static int disk_sysfs_symlinks(struct gendisk *disk){

             struct device *target = get_device(disk->driverfs_dev);

             int err;

             char *disk_name = NULL;

     

             if (target) {

                     disk_name = make_block_name(disk);

                     if (!disk_name) {

                             err = -ENOMEM;

                             goto err_out;

                     }

     

                     err = sysfs_create_link(&disk->kobj, &target->kobj, "device");

                     if (err)

                             goto err_out_disk_name;

     

                     err = sysfs_create_link(&target->kobj, &disk->kobj, disk_name);

                     if (err)

                             goto err_out_dev_link;

             }

     

             err = sysfs_create_link(&disk->kobj, &block_subsys.kobj,

                                     "subsystem");

             if (err)

                     goto err_out_disk_name_lnk;

     

             kfree(disk_name);

     

             return 0;

     

    err_out_disk_name_lnk:

             if (target) {

                     sysfs_remove_link(&target->kobj, disk_name);

    err_out_dev_link:

                     sysfs_remove_link(&disk->kobj, "device");

    err_out_disk_name:

                     kfree(disk_name);

    err_out:

                     put_device(target);

             }

             return err;

    }

     

    我们用实际效果来解读这个函数。首先我们看正常工作的U盘会在/sys/block/sdf下面有哪些内容:

    [root@localhost ~]# ls /sys/block/sdf/

    capability  dev  device  holders  queue  range  removable  size  slaves  stat  subsystem  uevent

     

    第一个sysfs_create_link创建的就是这里这个device这个软链接文件。我们来看它链接到哪里去了:

    [root@localhost ~]# ls -l /sys/block/sdf/device

    lrwxrwxrwx 1 root root 0 Dec 13 07:09

    /sys/block/sdf/device

    -> ../../devices/pci0000:00/0000:00:1d.7/usb4/4-4/4-4:1.0/host24/target24:0:0/24:0:0:0

     

    第二个sysfs_create_link则从那边又建立一个反链接,又给链接回来了:

    [root@localhost~]# ls -l ……

    lrwxrwxrwx 1 root root 0 Dec 13 21:16

    /sys/devices/pci0000:00/0000:00:1d.7/usb4/4-4/4-4:1.0/host24/target24:0:0/24:0:0:0/block:sdf -> ../../../../../../../../../block/sdf

     

    于是这就等于你中有我我中有你,你那边有一个文件链接到了我这边,我这边有一个文件链接到了你那边。第三个sysfs_create_link,生成的是/sys/block/sdf/subsystem这个软链接文件。

    [root@localhost ~]# ls -l /sys/block/sdf/subsystem

    lrwxrwxrwx 1 root root 0 Dec 13 07:09 /sys/block/sdf/subsystem -> ../../block

     

    三个链接文件建立好之后,disk_sysfs_symlinks也就结束了它的使命。接下来一个函数是disk_sysfs_add_subdirs。同样来自fs/partitions/check.c

     

    static inline void disk_sysfs_add_subdirs(struct gendisk *disk){

             struct kobject *k;

     

             k = kobject_get(&disk->kobj);

             disk->holder_dir = kobject_add_dir(k, "holders");

             disk->slave_dir = kobject_add_dir(k, "slaves");

             kobject_put(k);

    }

     

    这个函数的意图太明显了,无非就是建立holders和slaves两个子目录。

     

    504行接着调用一个内联函数,bdget_disk,《Thinking in C++》告诉我们内联函数最好定义在头文件中,所以这个函数来自include/linux/genhd.h

     

    static inline struct block_device *bdget_disk(struct gendisk *disk, int index){

             return bdget(MKDEV(disk->major, disk->first_minor) + index);

    }

     

    bdget来自fs/block_dev.c

     

    struct block_device *bdget(dev_t dev){

             struct block_device *bdev;

             struct inode *inode;

     

             inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),

                             bdev_test, bdev_set, &dev);

     

             if (!inode)

                     return NULL;

     

             bdev = &BDEV_I(inode)->bdev;

     

             if (inode->i_state & I_NEW) {

                     bdev->bd_contains = NULL;

                     bdev->bd_inode = inode;

                     bdev->bd_block_size = (1 << inode->i_blkbits);

                     bdev->bd_part_count = 0;

                     bdev->bd_invalidated = 0;

                     inode->i_mode = S_IFBLK;

                     inode->i_rdev = dev;

                     inode->i_bdev = bdev;

                     inode->i_data.a_ops = &def_blk_aops;

                     mapping_set_gfp_mask(&inode->i_data, GFP_USER);

                     inode->i_data.backing_dev_info = &default_backing_dev_info;

                     spin_lock(&bdev_lock);

                     list_add(&bdev->bd_list, &all_bdevs);

                     spin_unlock(&bdev_lock);

                     unlock_new_inode(inode);

             }

             return bdev;

    }

     

    这个函数是什么意思呢,还记得前面讲过的struct block_device数据结构,以及我们的老熟人struct inode数据结构。不错,Linux中每一个Block设备都由这么一个结构体变量表示,这玩意儿因此被称作块设备描述符。inode我们不多讲,但是这里一个很重要的结构体是struct bdev_inode

     

    struct bdev_inode {

             struct block_device bdev;

             struct inode vfs_inode;

    };

     

    bdev_inode好像没出现过,用来干嘛呢?我们来看看BDEV_I函数,这个内联函数来自fs/block_dev.c

     

    static inline struct bdev_inode *BDEV_I(struct inode *inode){

             return container_of(inode, struct bdev_inode, vfs_inode);

    }

     

    很显然,从inode得到相应的bdev_inode。于是这个&BDEV_I(inode)->bdev表示的就是inode对应的bdev_inode的成员struct block_device bdev

     

    但是bdev结构体变量是不会自动来到你的面前,需要的时候你要去申请才会有。iget5_locked就是干这件事情的,这个函数来自fs/inode.c,跟我们前面接触到的iget类似。我们显然不会去深入看它,只能告诉你,这个函数这么一执行,我们就既有inode又有block_device了,而且对于第一次申请的inode,其i_state成员是设置了I_NEW这个flag的,所以bdget()函数中,最后一段if语句是要被执行的。这一段if语句的作用就是初始化inode结构体指针inode以及block_device结构体指针bdev。而函数最终返回的也正是bdev。需要强调一下,bdev正是从这一刻开始正式崭露头角的。

     

    回到register_disk()中,继续往下。下一个重量级的函数是blkdev_get,来自fs/block_dev.c

     

    static int __blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags,

                             int for_part){

             struct file fake_file = {};

             struct dentry fake_dentry = {};

             fake_file.f_mode = mode;

             fake_file.f_flags = flags;

             fake_file.f_path.dentry = &fake_dentry;

             fake_dentry.d_inode = bdev->bd_inode;

     

             return do_open(bdev, &fake_file, for_part);

    }

     

    int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags){

             return __blkdev_get(bdev, mode, flags, 0);

    }

     

    看到blkdev_get调用的是__blkdev_get,所以我们两个函数一块贴出来了。很显然,真正需要看的却是do_open,来自同一个文件,我们来详细讨论一下:

     

       1110 static int do_open(struct block_device *bdev, struct file *file, int for_part)

       1111 {

       1112         struct module *owner = NULL;

       1113         struct gendisk *disk;

       1114         int ret = -ENXIO;

       1115         int part;

       1116

       1117         file->f_mapping = bdev->bd_inode->i_mapping;

       1118         lock_kernel();

       1119         disk = get_gendisk(bdev->bd_dev, &part);  /* part肯定为0 */

       1120         if (!disk) {

       1121                 unlock_kernel();

       1122                 bdput(bdev);

       1123                 return ret;

       1124         }

       1125         owner = disk->fops->owner;

       1126

       1127         mutex_lock_nested(&bdev->bd_mutex, for_part);

       1128         if (!bdev->bd_openers) {

       1129                 bdev->bd_disk = disk;

       1130                 bdev->bd_contains = bdev;

       1131                 if (!part) {

       1132                         struct backing_dev_info *bdi;

       1133                         if (disk->fops->open) {

       1134                                 ret = disk->fops->open(bdev->bd_inode, file);

       1135                                 if (ret)

       1136                                         goto out_first;

       1137                         }

       1138                         if (!bdev->bd_openers) {

       1139                                 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);

       1140                                 bdi = blk_get_backing_dev_info(bdev);

       1141                                 if (bdi == NULL)

       1142                                         bdi = &default_backing_dev_info;

       1143                                 bdev->bd_inode->i_data.backing_dev_info = bdi;

       1144                         }

       1145                         if (bdev->bd_invalidated)

       1146                                 rescan_partitions(disk, bdev);

       1147                 } else {

       1148                         struct hd_struct *p;

       1149                         struct block_device *whole;

       1150                         whole = bdget_disk(disk, 0);

       1151                         ret = -ENOMEM;

       1152                         if (!whole)

       1153                                 goto out_first;

       1154                         BUG_ON(for_part);

       1155                         ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1);

       1156                         if (ret)

       1157                                 goto out_first;

       1158                         bdev->bd_contains = whole;

       1159                         p = disk->part[part - 1];

       1160                         bdev->bd_inode->i_data.backing_dev_info =

       1161                            whole->bd_inode->i_data.backing_dev_info;

       1162                         if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) {

       1163                                 ret = -ENXIO;

       1164                                 goto out_first;

       1165                         }

       1166                         kobject_get(&p->kobj);

       1167                         bdev->bd_part = p;

       1168                         bd_set_size(bdev, (loff_t) p->nr_sects << 9);

       1169                 }

       1170         } else {

       1171                 put_disk(disk);

       1172                 module_put(owner);

       1173                 if (bdev->bd_contains == bdev) {

       1174                         if (bdev->bd_disk->fops->open) {

       1175                                ret = bdev->bd_disk->fops->open(bdev->bd_inode, file);

       1176                                if (ret)

       1177                                         goto out;

       1178                         }

       1179                         if (bdev->bd_invalidated)

       1180                                 rescan_partitions(bdev->bd_disk, bdev);

       1181                 }

       1182         }

       1183         bdev->bd_openers++;

       1184         if (for_part)

       1185                 bdev->bd_part_count++;

       1186         mutex_unlock(&bdev->bd_mutex);

       1187         unlock_kernel();

       1188         return 0;

       1189

       1190 out_first:

       1191         bdev->bd_disk = NULL;

       1192         bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info;

       1193         if (bdev != bdev->bd_contains)

       1194                 __blkdev_put(bdev->bd_contains, 1);

       1195         bdev->bd_contains = NULL;

       1196         put_disk(disk);

       1197         module_put(owner);

       1198 out:

       1199         mutex_unlock(&bdev->bd_mutex);

       1200         unlock_kernel();

       1201         if (ret)

       1202                 bdput(bdev);

       1203         return ret;

       1204 }

     

    一开始的时候,bd_openers是被初始化为了0,所以1128行这个if语句是要被执行的。bd_openers为0表示一个文件还没有被打开过。

     

    一开始我们还没有涉及到分区的信息,所以一开始我们只有sda这个概念,而没有sda1、sda2、sda3…这些概念。这时候我们调用get_gendisk得到的part一定是0。所以1131行的if语句也会执行。而disk->fops->open很明显,就是sd_open(因为我们在sd_probe中曾经设置了gd->fops等于&sd_fops)。

     

    但此时此刻我们执行sd_open实际上是不做什么正经事儿的。顶多就是测试一下看看sd_open能不能执行,如果能执行,那么就返回0。如果根本就不能执行,那就赶紧汇报错误。

     

    接下来还有几个函数,主要做一些赋值,暂时不去看它,等到适当的时候需要看了再回来看。

     

    1146行这个rescan_partitions()显然是我们要看的,首先我们在调用blkdev_get之前把bd_invalidated设置为了1,所以这个函数这次一定会被执行。从这一刻开始分区信息闯入了我们的生活。这个函数来自fs/partitions/check.c

     

        530 int rescan_partitions(struct gendisk *disk, struct block_device *bdev)

        531 {

        532         struct parsed_partitions *state;

        533         int p, res;

        534

        535         if (bdev->bd_part_count)

        536                 return -EBUSY;

        537         res = invalidate_partition(disk, 0);

        538         if (res)

        539                 return res;

        540         bdev->bd_invalidated = 0;

        541         for (p = 1; p < disk->minors; p++)

        542                 delete_partition(disk, p);

        543         if (disk->fops->revalidate_disk)

        544                 disk->fops->revalidate_disk(disk);

        545         if (!get_capacity(disk) || !(state = check_partition(disk, bdev)))

        546                 return 0;

        547         if (IS_ERR(state))      /* I/O error reading the partition table */

        548                 return -EIO;

        549         for (p = 1; p < state->limit; p++) {

        550                 sector_t size = state->parts[p].size;

        551                 sector_t from = state->parts[p].from;

        552                 if (!size)

        553                         continue;

        554                 if (from + size > get_capacity(disk)) {

        555                         printk(" %s: p%d exceeds device capacity\n",

        556                                 disk->disk_name, p);

        557                 }

        558                 add_partition(disk, p, from, size, state->parts[p].flags);

        559 #ifdef CONFIG_BLK_DEV_MD

        560                 if (state->parts[p].flags & ADDPART_FLAG_RAID)

        561                         md_autodetect_dev(bdev->bd_dev+p);

        562 #endif

        563         }

        564         kfree(state);

        565         return 0;

        566 }

     

    这个函数执行过后,关于分区的信息我们就算都有了。关于分区,前面我们看到是用struct hd_struct这么个结构体来表示的,而struct hd_struct也正是struct gendisk的成员,并且是个二级指针。接着,get_capacity()。没有比这个函数更简单的函数了。来自include/linux/genhd.h

     

    static inline sector_t get_capacity(struct gendisk *disk){

             return disk->capacity;

    }

     

    check_partition就稍微复杂一些了,来自fs/partitions/check.c,我们就不多讲了,这个函数主要是利用parsed_partitions数据结构来记录分区信息的,并且调用check_part来专门指定一个分区表格式,然后我们就来到了add_partition,仍然是来自fs/partitions/check.c

     

        371 void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags)

        372 {

        373         struct hd_struct *p;

        374

        375         p = kmalloc(sizeof(*p), GFP_KERNEL);

        376         if (!p)

        377                 return;

        378

        379         memset(p, 0, sizeof(*p));

        380         p->start_sect = start;

        381         p->nr_sects = len;

        382         p->partno = part;

        383         p->policy = disk->policy;

        384

        385         if (isdigit(disk->kobj.name[strlen(disk->kobj.name)-1]))

        386                snprintf(p->kobj.name,KOBJ_NAME_LEN,"%sp%d",disk->kobj.name,part);

        387         else

        388                snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part);

        389         p->kobj.parent = &disk->kobj;

        390         p->kobj.ktype = &ktype_part;

        391         kobject_init(&p->kobj);

        392         kobject_add(&p->kobj);

        393         if (!disk->part_uevent_suppress)

        394                 kobject_uevent(&p->kobj, KOBJ_ADD);

        395         sysfs_create_link(&p->kobj, &block_subsys.kobj, "subsystem");

        396         if (flags & ADDPART_FLAG_WHOLEDISK) {

        397                 static struct attribute addpartattr = {

        398                         .name = "whole_disk",

        399                         .mode = S_IRUSR | S_IRGRP | S_IROTH,

        400                         .owner = THIS_MODULE,

        401                 };

        402

        403                 sysfs_create_file(&p->kobj, &addpartattr);

        404         }

        405         partition_sysfs_add_subdir(p);

        406         disk->part[part-1] = p;

        407 }

     

    有了之前的经验,现在再看这些kobject相关的,sysfs相关的函数就容易多了。

     

    389行这个p->kobj.parent = &disk->kobj保证了我们接下来生成的东西在刚才的目录之下,即sda1、sda2、…在sda目录下。而395行sysfs_create_link的效果也很显然。而partition_sysfs_add_subdir也没什么好说的,添加了holders子目录。

     

    最后,让我们记住这个函数做过的一件事情,对p的各个成员进行了赋值,而在函数的结尾处把disk->part[part-1]指向了p。也就是说,从此以后,struct hd_struct这个指针数组里就应该有内容了,而不再是空的。

     

    到这里,rescan_partitions()宣告结束,回到do_open()。1183行,让bd_openers这个引用计数增加1,如果for_part有值,那么就让它对应的引用计数也加1。然后do_open也就华丽丽的结束了,像多米诺骨牌一样,__blkdev_get和blkdev_get相继返回。blkdev_put和blkdev_get做的事情基本相反,我们就不看了,只是需要注意,它把刚才增加上去的这两个引用计数给减了回去。

     

    最后,register_disk()中调用的最后一个函数就是kobject_uevent(),这个函数就是通知用户空间的进程udevd,告诉它有事件发生了,如果你使用的发行版正确配置了udev的配置文件(详见/etc/udev/目录下),那么其效果就是让/dev目录下面有了相应的设备文件。比如:

     

    [root@localhost tedkdb]# ls /dev/sda*

    /dev/sda   /dev/sda10  /dev/sda12  /dev/sda14  /dev/sda2  /dev/sda5  /dev/sda7  /dev/sda9 /dev/sda1  /dev/sda11  /dev/sda13  /dev/sda15  /dev/sda3  /dev/sda6  /dev/sda8

     

    至于为什么,你可以去阅读关于udev的知识,这是用户空间的程序,咱们就不多说了。

     

    最新回复(0)