0、写在前面

关于如何下手走读 Linux 的源码，每个人都有不同的理解。从 UNIX 一贯而来的“一切皆文件”思想来看，应该从文件系统着手分析，因为许多其他的模块都使用到了文件系统。

为了支持各种各样的文件系统实例，统一各种不同文件系统的接口，Linux 将文件系统公共部分抽象出来，形成了虚拟文件系统（virtual file system, vfs）。

虚拟文件系统是 Linux 内核的一个组件，用于处理与文件和文件系统相关的所有系统调用。VFS 是内核向用户态应用程序提供文件系统接口的通用接口层，同时也提供了抽象化的操作接口，以便底层各种文件系统实现。

本文是基于 Linux 5.10.127 源码分析，涉及到文件系统实例以 ext4 为例。

1、vfs 总体架构

vfs 涉及到的数据结构比较复杂，比较重要的是下面 5 个：

file_system_type：表示某种文件系统
super_block：表示一个文件系统实例
dentry：表示目录项
inode：索引节点
file：进程角度打开的文件

上述 5 个数据结构，总体的布局如下

file_system_type 表示一种文件系统，比如 ext2、ext4 等，表示某类型文件系统。Linux 系统中将所有的 file_system_type 结构体都用一个单向链表串联起来，头节点是 file_systems。

而 super_block 代表具体某个已经挂载的文件系统，标识一个文件系统实例。比如某个硬盘是 ext4 文件系统，挂载后就是 ext4 文件系统的一个实例。1）所属同一个 file_system_type 的 super_block 结构体使用单向链表链接；2）系统中所有 super_block 都链接到一个双向链表上，头节点是 super_blocks。

比如 ext4 文件系统，将多个 block 组织成一个 block group，主 super_block 位于分区起始偏移 1024 字节处（即第 0 个 block group 内）。

dentry 表示目录，Linux 将目录也当作文件，也有一个 inode。两者有什么区别呢？区别在于操作函数，dentry 的 inode 操作函数和普通文件操作函数不同。

inode 包含了内核在操作文件或目录时需要的全部信息。对于 UNIX 风格的文件系统，这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件，内核中一个 inode 代表打开的一个文件，是打开文件的唯一标识。

file 是从进程角度表示一个打开的文件，不同进程或者同一进程的多个 file 可以指向同一个 inode。

2、file_system_type

Linux 使用 file_system_type 表示一种文件系统，比如 ext2、ext4、exfat 等等。文件系统可以有多个实例，每个实例都使用 super_block 表征。

关于 fs_context 和 fs_parameter_spec 这里先跳过，后面讲 mount 的时候再分析。

/// include/linux/fs.h
/*
 * Describes one filesystem type (for example ext2 or ext4). A type may have
 * several mounted instances; each instance is represented by a super_block.
 */
struct file_system_type {
        /* Name of the filesystem type, e.g. "ext4". */
        const char *name;
        /* Bitmask built from the FS_* flags defined below. */
        int fs_flags;
#define FS_REQUIRES_DEV         1
#define FS_BINARY_MOUNTDATA     2
#define FS_HAS_SUBTYPE          4
#define FS_USERNS_MOUNT         8       /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16      /* Disable fanotify permission events */
#define FS_THP_SUPPORT          8192    /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE   32768   /* FS will handle d_move() during rename() internally. */
        /* init_fs_context and parameters belong to the fs_context machinery used at mount time. */
        int (*init_fs_context)(struct fs_context *);
        const struct fs_parameter_spec *parameters;
        /* Mounts an instance of this filesystem type. */
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
        /* Receives a super_block; presumably releases it on unmount -- TODO confirm. */
        void (*kill_sb) (struct super_block *);
        /* Module implementing this type; __get_fs_type() pins it with try_module_get(). */
        struct module *owner;
        /* Next entry in the singly linked list of registered filesystem types. */
        struct file_system_type * next;
        /* Head of the list linking every super_block (instance) of this type. */
        struct hlist_head fs_supers;

        /*
         * Lock class keys. NOTE(review): presumably used to classify the
         * per-superblock (s_*) locks -- confirm against the lockdep users.
         */
        struct lock_class_key s_lock_key;
        struct lock_class_key s_umount_key;
        struct lock_class_key s_vfs_rename_key;
        struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];

        /* Same idea for the per-inode (i_*) locks -- TODO confirm. */
        struct lock_class_key i_lock_key;
        struct lock_class_key i_mutex_key;
        struct lock_class_key i_mutex_dir_key;
};

name 表示文件系统名字，比如 ext2、ext4；
fs_flags 是一些 FS_ 标志位；
init_fs_context/parameters 和 fs_context 有关系，挂载的时候用到；
mount 函数指针指向挂载文件系统实例的函数；
kill_sb 函数指针指向释放（销毁）super_block 的函数；
next 指针用于将文件系统链接成链表；
fs_supers 是链表头，用于链接文件系统的所有实例；

2.1、register_filesystem()/unregister_filesystem()

在挂载某个文件系统实例前，Linux 必须支持该文件系统。换句话说，需要将该文件系统 file_system_type 注册到 Linux。向 Linux 注册文件系统是通过 register_filesystem() 函数。

Linux 使用链表管理注册的文件系统，所有的文件系统都被链接到一个链表上，全局变量 file_systems 指向该链表的第一个元素。

 /// fs/filesystem.c
/* First element of the singly linked list of registered filesystem types. */
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock); // reader/writer lock guarding concurrent access to the list

Linux 不允许同一个文件系统重复注册。find_filesystem() 函数可以查找是否已经注册某个文件系统。find_filesystem() 函数返回的是指针的指针，如果存在，*p 指向该文件系统，否则 *p 为 NULL。

从下面的 for 循环可以看到，指针 p 其实指向的是 next 指针。*p 就是 next 的值。如果 *p 为 NULL，next 的值为 NULL，表示链表已经到最后一个节点了。

 /// fs/filesystem.c
/*
 * Search the registered-filesystem list for an entry whose name is exactly
 * the first @len bytes of @name.
 *
 * Returns the address of the link that refers to the match. If nothing
 * matches, the returned link is the NULL pointer terminating the list, so a
 * caller can append a new entry simply by storing through the result.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
        struct file_system_type **link = &file_systems;

        while (*link) {
                const char *candidate = (*link)->name;

                /* Same first @len bytes, and the candidate name ends right there. */
                if (!strncmp(candidate, name, len) && candidate[len] == '\0')
                        return link;
                link = &(*link)->next;
        }
        return link;
}

find_filesystem() 函数的处理手法值得我们学习：如何将查找和插入结合，在查找失败时，可以利用查找结果直接插入元素。

register_filesystem() 函数将一个文件系统注册到全局链表 file_systems 上，如果已经存在，则返回 -EBUSY，表示注册失败。其主要逻辑是调用 find_filesystem() 函数查找链表中是否已经存在待注册文件系统，不存在时才将待注册文件系统添加到链表中。

 /// fs/filesystem.c
/*
 * Add @fs to the global list of registered filesystem types.
 *
 * Returns 0 on success, -EINVAL when the parameter description fails
 * validation, and -EBUSY when @fs is already linked into a list or a type
 * with the same name is already registered.
 */
int register_filesystem(struct file_system_type * fs)
{
        struct file_system_type **slot;
        int ret;

        if (fs->parameters &&
            !fs_validate_description(fs->name, fs->parameters))
                return -EINVAL;

        /* get_fs_type() treats '.' as the type/subtype separator, so a type name must not contain one. */
        BUG_ON(strchr(fs->name, '.'));

        /* A non-NULL next pointer means this object is already chained to another type. */
        if (fs->next)
                return -EBUSY;

        write_lock(&file_systems_lock);
        slot = find_filesystem(fs->name, strlen(fs->name));
        if (*slot == NULL) {
                /* Not found: slot is the tail link of the list, so append here. */
                *slot = fs;
                ret = 0;
        } else {
                /* A type with this name is already registered. */
                ret = -EBUSY;
        }
        write_unlock(&file_systems_lock);

        return ret;
}

unregister_filesystem() 函数是从全局链表 file_systems 中删除某个文件系统。

/// fs/filesystem.c
/*
 * Remove @fs from the global list of registered filesystem types.
 *
 * Returns 0 when @fs was found and unlinked, -EINVAL when it is not on the
 * list.
 */
int unregister_filesystem(struct file_system_type * fs)
{
        struct file_system_type **link;

        write_lock(&file_systems_lock);
        for (link = &file_systems; *link; link = &(*link)->next) {
                if (*link != fs)
                        continue;

                /* Splice the entry out and leave it with a clean next pointer. */
                *link = fs->next;
                fs->next = NULL;
                write_unlock(&file_systems_lock);
                /* Called only after the lock has been dropped. */
                synchronize_rcu();
                return 0;
        }
        write_unlock(&file_systems_lock);

        return -EINVAL;
}

2.2、get_fs_type()

get_fs_type() 函数根据名字 name 查找是否注册了某个文件系统，如果注册了就返回指向对应 file_system_type 的指针。

/// fs/filesystem.c
/*
 * Look up a registered filesystem type by the first @len bytes of @name and
 * take a reference on its owning module.
 *
 * Returns the type, or NULL when it is not registered or the module
 * reference could not be taken.
 */
static struct file_system_type *__get_fs_type(const char *name, int len)
{
        struct file_system_type *found;

        read_lock(&file_systems_lock);
        found = *find_filesystem(name, len);
        if (found != NULL) {
                /* A type whose module cannot be pinned is reported as missing. */
                if (!try_module_get(found->owner))
                        found = NULL;
        }
        read_unlock(&file_systems_lock);

        return found;
}

/*
 * Resolve a filesystem type from @name, which may have the form
 * "type" or "type.subtype"; only the part before the first '.' is looked up.
 *
 * If the type is not registered, a module named "fs-<type>" is requested and
 * the lookup is retried once. A name carrying a subtype is accepted only when
 * the type sets FS_HAS_SUBTYPE; otherwise the reference is dropped again.
 *
 * Returns the filesystem type, or NULL when none is available.
 */
struct file_system_type *get_fs_type(const char *name)
{
        const char *dot = strchr(name, '.');
        int len = dot ? dot - name : strlen(name);
        struct file_system_type *fs = __get_fs_type(name, len);

        if (fs == NULL && request_module("fs-%.*s", len, name) == 0) {
                fs = __get_fs_type(name, len);
                if (fs == NULL)
                        pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
                                     len, name);
        }

        /* Nothing found, or a plain name without subtype: done. */
        if (fs == NULL || dot == NULL)
                return fs;

        /* "type.subtype" is fine when the type supports subtypes. */
        if (fs->fs_flags & FS_HAS_SUBTYPE)
                return fs;

        /* Subtype requested on a type that has none: release it and fail. */
        put_filesystem(fs);
        return NULL;
}

2.3、ext4_fs_type

ext4 在 module 初始化时调用 register_filesystem() 函数将 ext4_fs_type 注册到 Linux 系统中。ext4_fs_type 定义如下。mount 和 kill_sb 函数指针分别指向 ext4_mount() 和 kill_block_super() 两个函数。

/// fs/ext4/super.c
/* Filesystem type descriptor that ext4 passes to register_filesystem(). */
static struct file_system_type ext4_fs_type = {
        /* identity */
        .name           = "ext4",
        .fs_flags       = FS_REQUIRES_DEV,
        /* instance setup / teardown */
        .mount          = ext4_mount,
        .kill_sb        = kill_block_super,
        /* module providing this type */
        .owner          = THIS_MODULE,
};

ext4_mount() 函数直接调用 mount_bdev() 函数。

/// fs/ext4/super.c
/*
 * Mount callback of ext4_fs_type: forwards all arguments to mount_bdev(),
 * supplying ext4_fill_super as the superblock fill routine, and returns
 * whatever mount_bdev() returns.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
                       const char *dev_name, void *data)
{
        struct dentry *mounted;

        mounted = mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
        return mounted;
}

mount_bdev() 和 kill_block_super() 两个函数都不是 ext4 专有函数，都是通用函数。

3、super_block

vfs 中 super_block 和某个文件系统实例（保存在硬盘）的 super_block 不一样，vfs 中 super_block 记录的是通用信息，不同的文件系统 super_block 记录的信息不一样。Linux 将通用的信息提取出来，使用 super_block 数据结构保存。

super_block 代表了一个具体某个已经挂载的文件系统，标识一个文件系统实例的信息。除了文件系统的关键信息（块长度、最大文件长度，等等）之外，超级块还包含了读、写、操作 inode的函数指针。常用的信息如下所示：

依附的物理硬件
索引结点 inode 和数据块 block 的位置
block 的大小（字节）
文件系统类型
最长文件名
最大文件大小
根目录的 inode 位置
支持的操作

3.1、struct super_block

super_block 结构定义如下，其中的几个对于理解 vfs 架构比较有帮助

s_list 将所有的 super_block 都链接成双向链表
s_type 指向所属的 file_system_type
s_op 指向 super_block 的操作 op，不同的文件系统，op 一般不同
s_fs_info 指向具体文件系统的私有信息
s_instances 同属于一个 file_system 的 super_block 链接起来
s_inodes 保存所有的 inode
s_inodes_wb 保存需要回写的 inode

/// include/linux/fs.h
/*
 * VFS-level description of one mounted filesystem instance. It holds the
 * information common to all filesystem types; type-specific data hangs off
 * s_fs_info.
 */
struct super_block {
        struct list_head        s_list; // links every super_block into one list
        dev_t                   s_dev;
        unsigned char           s_blocksize_bits;
        unsigned long           s_blocksize;    /* block size */
        loff_t                  s_maxbytes;     /* maximum file size */
        struct file_system_type *s_type;        /* owning filesystem type */
        const struct super_operations   *s_op;  /* superblock operations */
        const struct dquot_operations   *dq_op;
        const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
        unsigned long           s_iflags;       /* internal SB_I_* flags */
        unsigned long           s_magic;
        struct dentry           *s_root; /* root dentry of this filesystem instance */
        struct rw_semaphore     s_umount;
        int                     s_count; /* reference count */
        atomic_t                s_active;
#ifdef CONFIG_SECURITY
        void                    *s_security;
#endif
        const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
        const struct fscrypt_operations *s_cop;
        struct key              *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
        const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
        struct unicode_map *s_encoding;
        __u16 s_encoding_flags;
#endif
        struct hlist_bl_head    s_roots;        /* alternate root dentries for NFS */
        struct list_head        s_mounts;       /* list of mounts; _not_ for fs use */
        struct block_device     *s_bdev;
        struct backing_dev_info *s_bdi;
        struct mtd_info         *s_mtd;
        struct hlist_node       s_instances; // links super_blocks of the same filesystem type
        unsigned int            s_quota_types;  /* Bitmask of supported quota types */
        struct quota_info       s_dquot;        /* Diskquota specific options */

        struct sb_writers       s_writers;

        /*
         * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
         * s_fsnotify_marks together for cache efficiency. They are frequently
         * accessed and rarely modified.
         */
        void                    *s_fs_info;     /* Filesystem private info */

        /* Granularity of c/m/atime in ns (cannot be worse than a second) */
        u32                     s_time_gran;
        /* Time limits for c/m/atime in seconds */
        time64_t                   s_time_min;
        time64_t                   s_time_max;
#ifdef CONFIG_FSNOTIFY
        __u32                   s_fsnotify_mask;
        struct fsnotify_mark_connector __rcu    *s_fsnotify_marks;
#endif

        char                    s_id[32];       /* Informational name */
        uuid_t                  s_uuid;         /* UUID */

        unsigned int            s_max_links;
        fmode_t                 s_mode;

        /*
         * The next field is for VFS *only*. No filesystems have any business
         * even looking at it. You had been warned.
         */
        struct mutex s_vfs_rename_mutex;        /* Kludge */

        /*
         * Filesystem subtype.  If non-empty the filesystem type field
         * in /proc/mounts will be "type.subtype"
         */
        const char *s_subtype;

        const struct dentry_operations *s_d_op; /* default d_op for dentries */

        /*
         * Saved pool identifier for cleancache (-1 means none)
         */
        int cleancache_poolid;

        struct shrinker s_shrink;       /* per-sb shrinker handle */

        /* Number of inodes with nlink == 0 but still referenced */
        atomic_long_t s_remove_count;

        /* Pending fsnotify inode refs */
        atomic_long_t s_fsnotify_inode_refs;

        /* Being remounted read-only */
        int s_readonly_remount;

        /* per-sb errseq_t for reporting writeback errors via syncfs */
        errseq_t s_wb_err;

        /* AIO completions deferred from interrupt context */
        struct workqueue_struct *s_dio_done_wq;
        struct hlist_head s_pins;

        /*
         * Owning user namespace and default context in which to
         * interpret filesystem uids, gids, quotas, device nodes,
         * xattrs and security labels.
         */
        struct user_namespace *s_user_ns;

        /*
         * The list_lru structure is essentially just a pointer to a table
         * of per-node lru lists, each of which has its own spinlock.
         * There is no need to put them into separate cachelines.
         */
        struct list_lru         s_dentry_lru; // cached dentries
        struct list_lru         s_inode_lru; // cached inodes
        struct rcu_head         rcu;
        struct work_struct      destroy_work;

        struct mutex            s_sync_lock;    /* sync serialisation lock */

        /*
         * Indicates how deep in a filesystem stack this SB is
         */
        int s_stack_depth;

        /* s_inode_list_lock protects s_inodes */
        spinlock_t              s_inode_list_lock ____cacheline_aligned_in_smp;
        struct list_head        s_inodes;       /* all inodes */

        spinlock_t              s_inode_wblist_lock;
        struct list_head        s_inodes_wb;    /* inodes that need writeback */
} __randomize_layout;

Linux 中挂载的 super_block 除了链接到对应的 file_system_type::fs_supers 链表上，还将其链接到全局链表 super_blocks 中。

/// fs/super.c  
/* Global list onto which every mounted super_block is linked. */
static LIST_HEAD(super_blocks);
/* NOTE(review): presumably the spinlock protecting super_blocks -- confirm against fs/super.c. */
static DEFINE_SPINLOCK(sb_lock);

3.2、super_operations

super_operations 中定义了超级块支持的操作，是一组函数指针，指向比如 inode 分配、销毁与释放，以及将 inode 数据写回磁盘等函数。

/// include/linux/fs.h
/*
 * Table of operations a filesystem provides for its super_block: inode
 * allocation, destruction and freeing, writing inode data back to disk, and
 * other per-instance hooks.
 */
struct super_operations {
        struct inode *(*alloc_inode)(struct super_block *sb); // allocate an inode
        void (*destroy_inode)(struct inode *); // destroy an inode
        void (*free_inode)(struct inode *); // free an inode

        void (*dirty_inode) (struct inode *, int flags);
        int (*write_inode) (struct inode *, struct writeback_control *wbc);
        int (*drop_inode) (struct inode *);
        void (*evict_inode) (struct inode *);
        void (*put_super) (struct super_block *);
        int (*sync_fs)(struct super_block *sb, int wait);
        int (*freeze_super) (struct super_block *);
        int (*freeze_fs) (struct super_block *);
        int (*thaw_super) (struct super_block *);
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*umount_begin) (struct super_block *);

        /* seq_file based reporting hooks. */
        int (*show_options)(struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
        /* Quota hooks; present only when CONFIG_QUOTA is enabled. */
        ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
        ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
        struct dquot **(*get_dquots)(struct inode *);
#endif
        int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
        long (*nr_cached_objects)(struct super_block *,
                                  struct shrink_control *);
        long (*free_cached_objects)(struct super_block *,
                                    struct shrink_control *);
};

3.3、ext4_sops

比如 ext4 文件系统，实现的 super_operations 如下 ext4_sops 所示。

/// fs/ext4/super.c
/* super_operations implementation provided by ext4. */
static const struct super_operations ext4_sops = {
        /* inode lifetime */
        .alloc_inode    = ext4_alloc_inode,
        .destroy_inode  = ext4_destroy_inode,
        .free_inode     = ext4_free_in_core_inode,
        .drop_inode     = ext4_drop_inode,
        .evict_inode    = ext4_evict_inode,

        /* inode dirtying and write-out */
        .dirty_inode    = ext4_dirty_inode,
        .write_inode    = ext4_write_inode,

        /* whole-filesystem operations */
        .put_super      = ext4_put_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs      = ext4_freeze,
        .unfreeze_fs    = ext4_unfreeze,
        .remount_fs     = ext4_remount,

        /* reporting */
        .statfs         = ext4_statfs,
        .show_options   = ext4_show_options,

#ifdef CONFIG_QUOTA
        .get_dquots     = ext4_get_dquots,
        .quota_read     = ext4_quota_read,
        .quota_write    = ext4_quota_write,
#endif
        .bdev_try_to_free_page = bdev_try_to_free_page,
};

4、inode

要访问一个文件，一定要通过它的 inode 才能知道这个文件是什么类型的文件、是怎么组织的、文件中存储着多少数据、这些数据在什么地方以及其下层的驱动程序在哪等必要的信息。

每个 inode 都有一个唯一的整数，在每个文件系统实例里 inode 编号唯一。索引结点 inode 包含了内核在操作文件或目录时（目录也被当作文件看待）需要的全部信息。对于 UNIX 风格的文件系统，这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件，内核中一个 inode 代表打开的一个文件。

4.1、struct inode

大部分成员用于管理简单的状态信息。例如 i_atime、i_mtime、i_ctime 分别存储了最后访问的时间、最后修改的时间、最后修改 inode 的时间。

文件访问权限和所有权保存在 i_mode（文件类型和访问权限）、i_uid 和 i_gid（与该文件相关的 UID 和 GID）中。

文件长度保存在 i_size，按字节计算。

文件和目录都有一个 inode，但是 i_op 指向的 inode_operations 实现却是不同。

/// include/linux/fs.h
/*
 * In-kernel index node. It carries the information the kernel needs to
 * operate on a file or directory: type and permissions (i_mode), ownership
 * (i_uid/i_gid), size (i_size), timestamps and the operation tables.
 */
struct inode {
        /* File type and access permissions. */
        umode_t                 i_mode;
        unsigned short          i_opflags;
        /* UID and GID associated with the file. */
        kuid_t                  i_uid;
        kgid_t                  i_gid;
        unsigned int            i_flags;

#ifdef CONFIG_FS_POSIX_ACL
        struct posix_acl        *i_acl;
        struct posix_acl        *i_default_acl;
#endif

        const struct inode_operations   *i_op; // inode operations
        struct super_block      *i_sb; // owning super_block
        struct address_space    *i_mapping;

#ifdef CONFIG_SECURITY
        void                    *i_security;
#endif

        unsigned long           i_ino; // inode number, unique within one filesystem instance

        union {
                const unsigned int i_nlink; // number of links to this inode
                unsigned int __i_nlink;
        };
        dev_t                   i_rdev; // associated device
        loff_t                  i_size; // file size in bytes
        struct timespec64       i_atime; // time of last access
        struct timespec64       i_mtime; // time of last modification
        struct timespec64       i_ctime; // time of last inode change
        spinlock_t              i_lock;
        unsigned short          i_bytes;
        u8                      i_blkbits;
        u8                      i_write_hint;
        blkcnt_t                i_blocks; // file length as a number of blocks

#ifdef __NEED_I_SIZE_ORDERED
        seqcount_t              i_size_seqcount;
#endif

        /* Misc */
        unsigned long           i_state;
        struct rw_semaphore     i_rwsem;

        unsigned long           dirtied_when;   /* jiffies of first dirtying */
        unsigned long           dirtied_time_when;

        struct hlist_node       i_hash;
        struct list_head        i_io_list;      /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
        struct bdi_writeback    *i_wb;          /* the associated cgroup wb */

        /* foreign inode detection, see wbc_detach_inode() */
        int                     i_wb_frn_winner;
        u16                     i_wb_frn_avg_time;
        u16                     i_wb_frn_history;
#endif
        struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        struct list_head        i_wb_list;      /* backing dev writeback list */
        union {
                struct hlist_head       i_dentry;
                struct rcu_head         i_rcu;
        };
        atomic64_t              i_version;
        atomic64_t              i_sequence; /* see futex */
        atomic_t                i_count;
        atomic_t                i_dio_count;
        atomic_t                i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
        atomic_t                i_readcount; /* struct files open RO */
#endif
        union {
                const struct file_operations    *i_fop; // file operations
                void (*free_inode)(struct inode *);
        };
        struct file_lock_context        *i_flctx;
        struct address_space    i_data;
        struct list_head        i_devices;
        union {
                struct pipe_inode_info  *i_pipe; // set when the inode is a pipe
                struct block_device     *i_bdev; // set when the inode is a block device
                struct cdev             *i_cdev; // set when the inode is a character device
                char                    *i_link; // set when the inode is a link
                unsigned                i_dir_seq;
        };

        __u32                   i_generation;

#ifdef CONFIG_FSNOTIFY
        __u32                   i_fsnotify_mask; /* all events this inode cares about */
        struct fsnotify_mark_connector __rcu    *i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
        struct fscrypt_info     *i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
        struct fsverity_info    *i_verity_info;
#endif

        void                    *i_private; /* fs or device private pointer */
} __randomize_layout;

4.2、inode_operations

大多数情况下，各个函数指针成员的语义可以根据其名称推断。它们与对应的系统调用和用户空间工具在名称上非常相似。例如，rmdir 删除目录，rename 重命名文件系统对象，等等。

尽管如此，并非所有名称有熟悉的命令对应。

/// include/linux/fs.h
/*
 * Table of operations on an inode. Most members are named after the system
 * call or user-space tool they serve (e.g. rmdir removes a directory, rename
 * renames a filesystem object). Regular files and directories use different
 * tables.
 */
struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);

        int (*readlink) (struct dentry *, char __user *,int);

        /* Namespace operations. */
        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
        int (*unlink) (struct inode *,struct dentry *);
        int (*symlink) (struct inode *,struct dentry *,const char *);
        int (*mkdir) (struct inode *,struct dentry *,umode_t);
        int (*rmdir) (struct inode *,struct dentry *);
        int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        /* Attribute access. */
        int (*setattr) (struct dentry *, struct iattr *);
        int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
        int (*update_time)(struct inode *, struct timespec64 *, int);
        int (*atomic_open)(struct inode *, struct dentry *,
                           struct file *, unsigned open_flag,
                           umode_t create_mode);
        int (*tmpfile) (struct inode *, struct dentry *, umode_t);
        int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;

普通文件和目录文件的 inode_operations 不一样。Linux 将目录也当作文件，比如 ext4 普通文件对应的 inode_operations 定义如下：

/// fs/ext4/file.c
/* inode_operations that ext4 installs for regular files. */
const struct inode_operations ext4_file_inode_operations = {
        /* attributes */
        .getattr        = ext4_file_getattr,
        .setattr        = ext4_setattr,
        .listxattr      = ext4_listxattr,
        /* ACLs */
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        /* extent mapping */
        .fiemap         = ext4_fiemap,
};

而目录文件的 inode_operations 支持的函数比普通多，定义如下：

/// fs/ext4/namei.c
/*
 * inode_operations that ext4 installs for directories. On top of the
 * attribute/ACL hooks used for regular files it provides the namespace
 * operations (lookup, create, link, unlink, ...).
 */
const struct inode_operations ext4_dir_inode_operations = {
        /* name lookup */
        .lookup         = ext4_lookup,
        /* creating entries */
        .create         = ext4_create,
        .mkdir          = ext4_mkdir,
        .mknod          = ext4_mknod,
        .symlink        = ext4_symlink,
        .link           = ext4_link,
        .tmpfile        = ext4_tmpfile,
        /* removing and moving entries */
        .unlink         = ext4_unlink,
        .rmdir          = ext4_rmdir,
        .rename         = ext4_rename2,
        /* attributes */
        .getattr        = ext4_getattr,
        .setattr        = ext4_setattr,
        .listxattr      = ext4_listxattr,
        /* ACLs */
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        /* extent mapping */
        .fiemap         = ext4_fiemap,
};

某些特殊文件的 inode_operations 定义如下

/// fs/ext4/namei.c
/* inode_operations that ext4 installs for special files: attribute and ACL hooks only. */
const struct inode_operations ext4_special_inode_operations = {
        /* attributes */
        .getattr        = ext4_getattr,
        .setattr        = ext4_setattr,
        .listxattr      = ext4_listxattr,
        /* ACLs */
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
};

4.3、file_operations

文件不能只存储信息，必须容许操作其中的信息。从用户的角度来看，文件操作由标准库的函数执行。这些函数指示内核执行系统调用，然后系统调用执行所需的操作。

当然各个文件系统的实现接口不同，因而用于抽象文件操作的结构必须尽可能通用，以考虑到各种各样的目标文件。同时，它不能带有过多只适用于特定文件类型的专门操作。

各个 inode 实例都包含一个指向 file_operations 实例的指针 i_fop，该结构保存了指向所有可能文件操作的函数指针。该结构定义如下：

/// include/linux/fs.h
/*
 * Table of function pointers covering the possible operations on an open
 * file. Every inode points at one such table through i_fop; regular files
 * and directories use different tables.
 */
struct file_operations {
        struct module *owner;
        loff_t (*llseek) (struct file *, loff_t, int);
        /* read/write/read_iter/write_iter are the data read and write entry points. */
        ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
        ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
        ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
        ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
        int (*iopoll)(struct kiocb *kiocb, bool spin);
        int (*iterate) (struct file *, struct dir_context *);
        int (*iterate_shared) (struct file *, struct dir_context *);
        /* Implements the poll mechanism for this file. */
        __poll_t (*poll) (struct file *, struct poll_table_struct *);
        long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
        long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
        /* Supports memory mapping of the file. */
        int (*mmap) (struct file *, struct vm_area_struct *);
        unsigned long mmap_supported_flags;
        int (*open) (struct inode *, struct file *);
        int (*flush) (struct file *, fl_owner_t id);
        int (*release) (struct inode *, struct file *);
        int (*fsync) (struct file *, loff_t, loff_t, int datasync);
        int (*fasync) (int, struct file *, int);
        int (*lock) (struct file *, int, struct file_lock *);
        ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
        unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
        int (*check_flags)(int);
        int (*flock) (struct file *, int, struct file_lock *);
        ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
        ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
        int (*setlease)(struct file *, long, struct file_lock **, void **);
        long (*fallocate)(struct file *file, int mode, loff_t offset,
                          loff_t len);
        void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
        unsigned (*mmap_capabilities)(struct file *);
#endif
        ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
                        loff_t, size_t, unsigned int);
        loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
                                   struct file *file_out, loff_t pos_out,
                                   loff_t len, unsigned int remap_flags);
        int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

read/write/read_iter/write_iter 是读写操作函数
poll 是文件系统 poll 机制的实现
mmap 是为了支持内存映射

普通文件和目录文件对应的 file_operations 是不同的，比如 ext4 文件系统普通文件的 file_operations 定义如下

/// fs/ext4/file.c
/* file_operations that ext4 installs for regular files. */
const struct file_operations ext4_file_operations = {
        /* open / close */
        .open           = ext4_file_open,
        .release        = ext4_release_file,

        /* positioning and data transfer */
        .llseek         = ext4_llseek,
        .read_iter      = ext4_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .iopoll         = iomap_dio_iopoll,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,

        /* memory mapping */
        .mmap           = ext4_file_mmap,
        .mmap_supported_flags = MAP_SYNC,
        .get_unmapped_area = thp_get_unmapped_area,

        /* syncing and space allocation */
        .fsync          = ext4_sync_file,
        .fallocate      = ext4_fallocate,

        /* ioctl entry points */
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
};

而目录的文件操作

/// fs/ext4/dir.c
/* file_operations that ext4 installs for directories. */
const struct file_operations ext4_dir_operations = {
        /* open / close */
        .open           = ext4_dir_open,
        .release        = ext4_release_dir,

        /* positioning and directory reading */
        .llseek         = ext4_dir_llseek,
        .read           = generic_read_dir,
        .iterate_shared = ext4_readdir,

        /* syncing */
        .fsync          = ext4_sync_file,

        /* ioctl entry points */
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
};

5、dentry

Linux 系统将目录也当作一个文件，文件内容是文件名或者目录名。

5.1、struct dentry

struct dentry是一种标准化的数据结构，可以表示文件名或目录。它还建立了文件名及其inode之间的关联

/// include/linux/dcache.h
/*
 * Directory entry: a standardized structure that can represent a file name
 * or a directory, and that ties the name to its inode.
 */
struct dentry {
        /* RCU lookup touched fields */
        unsigned int d_flags;           /* protected by d_lock */
        seqcount_spinlock_t d_seq;      /* per dentry seqlock */
        struct hlist_bl_node d_hash;    // hash-list linkage, for fast lookup
        struct dentry *d_parent;        // parent directory
        struct qstr d_name;
        struct inode *d_inode;          // associated inode
        unsigned char d_iname[DNAME_INLINE_LEN]; // inline storage for short names

        /* Ref lookup also touches following */
        struct lockref d_lockref;       /* per-dentry lock and refcount */
        const struct dentry_operations *d_op; // dentry operations
        struct super_block *d_sb;       /* The root of the dentry tree */
        unsigned long d_time;           /* used by d_revalidate */
        void *d_fsdata;                 /* fs-specific data */

        union {
                struct list_head d_lru;   // linkage for the dentry cache (LRU)
                wait_queue_head_t *d_wait;      /* in-lookup ones only */
        };
        struct list_head d_child;       /* child of parent list */
        struct list_head d_subdirs;     /* our children */
        /*
         * d_alias and d_rcu can share memory
         */
        union {
                struct hlist_node d_alias;      /* inode alias list */
                struct hlist_bl_node d_in_lookup_hash;  /* only for in-lookup ones */
                struct rcu_head d_rcu;
        } d_u;
} __randomize_layout;

5.2、dentry_operations

/// include/linux/dcache.h
/*
 * Table of operations a filesystem can attach to its dentries (through
 * dentry::d_op, or super_block::s_d_op as the default).
 */
struct dentry_operations {
        int (*d_revalidate)(struct dentry *, unsigned int);
        int (*d_weak_revalidate)(struct dentry *, unsigned int);
        /* Name hashing and comparison hooks; ext4_dentry_ops fills in exactly these two. */
        int (*d_hash)(const struct dentry *, struct qstr *);
        int (*d_compare)(const struct dentry *,
                        unsigned int, const char *, const struct qstr *);
        int (*d_delete)(const struct dentry *);
        int (*d_init)(struct dentry *);
        void (*d_release)(struct dentry *);
        void (*d_prune)(struct dentry *);
        void (*d_iput)(struct dentry *, struct inode *);
        char *(*d_dname)(struct dentry *, char *, int);
        struct vfsmount *(*d_automount)(struct path *);
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;

5.3、ext4_dentry_ops

/// fs/ext4/dir.c
/*
 * dentry_operations provided by ext4: only the name comparison and hashing
 * hooks are set, both to the generic_ci_* helpers.
 */
const struct dentry_operations ext4_dentry_ops = {
        .d_compare      = generic_ci_d_compare,
        .d_hash         = generic_ci_d_hash,
};

6、file

从进程的角度，标识打开的文件。主要维持如下信息。文件描述符（就是整数）用于在一个进程内唯一地标识打开的文件。该整数其实是数组的下标，数组是指针数组，每个数组项指向一个 file 结构实例，管理一个打开文件的所有信息。

struct file 保存了内核所看到的文件的特殊信息，比如

文件读写的标记的位置
打开文件的权限
指向 inode 的指针

struct file 定义如下：

/// include/linux/fs.h
/*
 * An open file as seen from a process. A file descriptor is an index into a
 * per-process array of pointers, each of which refers to one struct file.
 * Several struct file instances may refer to the same inode.
 */
struct file {
        union {
                struct llist_node       fu_llist;
                struct rcu_head         fu_rcuhead;
        } f_u;
        struct path             f_path; // ties the file name to its inode
        struct inode            *f_inode; // cached inode
        /* Caches the i_fop file operations of the inode. */
        const struct file_operations    *f_op;

        /*
         * Protects f_ep_links, f_flags.
         * Must not be taken from IRQ context.
         */
        spinlock_t              f_lock;
        enum rw_hint            f_write_hint;
        atomic_long_t           f_count;
        unsigned int            f_flags;
        fmode_t                 f_mode; // mode the file was opened with
        struct mutex            f_pos_lock;
        loff_t                  f_pos; // current read/write position
        struct fown_struct      f_owner;
        const struct cred       *f_cred;
        struct file_ra_state    f_ra;

        u64                     f_version;
#ifdef CONFIG_SECURITY
        void                    *f_security;
#endif
        /* needed for tty driver, and maybe others */
        void                    *private_data;

#ifdef CONFIG_EPOLL
        /* Used by fs/eventpoll.c to link all the hooks to this file */
        struct list_head        f_ep_links;
        struct list_head        f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
        struct address_space    *f_mapping;
        errseq_t                f_wb_err;
        errseq_t                f_sb_err; /* for syncfs */
} __randomize_layout
  __attribute__((aligned(4)));  /* lest something weird decides that 2 is OK */

f_op 缓存的是 inode 的 i_fop 文件操作函数。

7、fs_context

fs_context 是 file_system_type 和 super_block 之间的桥梁，创建和配置 super_block 都离不开 fs_context，主要在 mount 调用时使用。

7.1、fs_context_operations

/// include/linux/fs_context.h
struct fs_context_operations {
        void (*free)(struct fs_context *fc);
        int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
        int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
        int (*parse_monolithic)(struct fs_context *fc, void *data);
        int (*get_tree)(struct fs_context *fc);
        int (*reconfigure)(struct fs_context *fc);
};

7.2、fs_context

/// include/linux/fs_context.h
/*
 * Bridge between a file_system_type and a super_block: carries the state
 * needed to create and configure a super_block, mainly during mount.
 */
struct fs_context {
        const struct fs_context_operations *ops;
        struct mutex            uapi_mutex;     /* Userspace access mutex */
        struct file_system_type *fs_type;
        void                    *fs_private;    /* The filesystem's context */
        void                    *sget_key;
        struct dentry           *root;          /* The root and superblock */
        struct user_namespace   *user_ns;       /* The user namespace for this mount */
        struct net              *net_ns;        /* The network namespace for this mount */
        const struct cred       *cred;          /* The mounter's credentials */
        struct p_log            log;            /* Logging buffer */
        const char              *source;        /* The source name (eg. dev path) */
        void                    *security;      /* Linux S&M options */
        void                    *s_fs_info;     /* Proposed s_fs_info */
        unsigned int            sb_flags;       /* Proposed superblock flags (SB_*) */
        unsigned int            sb_flags_mask;  /* Superblock flags that were changed */
        unsigned int            s_iflags;       /* OR'd with sb->s_iflags */
        unsigned int            lsm_flags;      /* Information flags from the fs to the LSM */
        /* The remaining members are bit-fields: two 8-bit enums followed by three 1-bit flags. */
        enum fs_context_purpose purpose:8;
        enum fs_context_phase   phase:8;        /* The phase the context is in */
        bool                    need_free:1;    /* Need to call ops->free() */
        bool                    global:1;       /* Goes into &init_user_ns */
        bool                    oldapi:1;       /* Coming from mount(2) */
};

接下来会分析 mount/read/write 调用的执行过程。

收录于合集 #linux

8个

下一篇学习 Linux | 虚拟文件系统（二）文件系统 mount 以及文件创建 create

喜欢此内容的人还喜欢

学习 Linux | 内存管理（一）干货满满，一文搞懂 Slab 分配器

源知源为

不喜欢

不看的原因

确定

内容质量低
不看此公众号

学习 Linux | 虚拟文件系统（五）IO 复用 poll 改进了什么