0、写在前面
关于如何下手走读 Linux 的源码,每个人都有不同的理解。从 UNIX 一贯而来的“一切皆文件”思想来看,应该从文件系统着手分析,因为许多其他的模块都使用到了文件系统。
为了支持各种各样的文件系统实例,统一各种不同文件系统的接口,Linux 将文件系统公共部分抽象出来,形成了虚拟文件系统(virtual file system, vfs)。
虚拟文件系统是 Linux 内核的一个组件,用于处理与文件和文件系统相关的所有系统调用。VFS 既是内核向用户态应用程序提供文件系统接口的通用接口层,也向下提供了抽象化的操作接口,供底层各种文件系统去实现。
本文是基于 Linux 5.10.127 源码分析,涉及到文件系统实例以 ext4 为例。
1、vfs 总体架构
vfs 涉及到的数据结构比较复杂,比较重要的是下面 5 个:
- file_system_type:表示某种文件系统
- super_block:表示一个文件系统实例
- dentry:表示目录项(路径中的一个文件名或目录名)
- inode:索引节点
- file:进程角度打开的文件
上述 5 个数据结构,总体的布局如下
file_system_type 表示一种文件系统,比如 ext2、ext4 等,表示某类型文件系统。Linux 系统中将所有的 file_system_type 结构体都用一个单向链表串联起来,头节点是 file_systems。
而 super_block 代表具体某个已经挂载的文件系统,标识一个文件系统实例。比如某个硬盘是 ext4 文件系统,挂载后就是 ext4 文件系统的一个实例。1)同属一个 file_system_type 的 super_block 通过 s_instances 成员挂在 file_system_type::fs_supers 这个哈希链表(hlist)上;2)系统中所有 super_block 都链接到一个双向链表上,头节点是 super_blocks。
比如 ext4 文件系统,将多个 block 组织成一个 block group;主 super_block 位于设备起始偏移 1024 字节处,部分 block group 中还保存有它的备份。
dentry 表示目录项(路径中的一个文件名或目录名)。Linux 将目录也当作文件,目录同样有一个 inode。目录和普通文件有什么区别呢?区别在于操作函数:目录的 inode 操作函数和普通文件的 inode 操作函数不同。
inode 包含了内核在操作文件或目录时需要的全部信息。对于 UNIX 风格的文件系统,这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件,内核中一个 inode 代表打开的一个文件,是打开文件的唯一标识。
file 是从进程角度表示一个打开的文件,不同进程或者同一进程的多个 file 可以指向同一个 inode。
2、file_system_type
Linux 使用 file_system_type 表示一种文件系统,比如 ext2、ext4、exfat 等等。文件系统可以有多个实例,每个实例都使用 super_block 表征。
关于 fs_context 和 fs_parameter_spec 这里先跳过,后面讲 mount 的时候再分析。
/// include/linux/fs.h
/*
 * Describes one filesystem type (for example "ext4"). Registered types are
 * chained through ->next starting at file_systems; the super_blocks that are
 * instances of a type hang off ->fs_supers.
 */
struct file_system_type {
const char *name; /* filesystem name; find_filesystem() matches on it */
int fs_flags; /* bitmask of the FS_* flags defined below */
#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4 /* needed for "type.subtype" names, see get_fs_type() */
#define FS_USERNS_MOUNT 8 /* Can be mounted by userns root */
#define FS_DISALLOW_NOTIFY_PERM 16 /* Disable fanotify permission events */
#define FS_THP_SUPPORT 8192 /* Remove once all fs converted */
#define FS_RENAME_DOES_D_MOVE 32768 /* FS will handle d_move() during rename() internally. */
int (*init_fs_context)(struct fs_context *); /* fs_context hook, used on the mount path */
const struct fs_parameter_spec *parameters; /* validated by register_filesystem() when non-NULL */
/* Mounts one instance of this type and returns a dentry for it. */
struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *); /* operates on one super_block of this type */
struct module *owner; /* pinned with try_module_get() in __get_fs_type() */
struct file_system_type * next; /* next registered type; must be NULL before registration */
struct hlist_head fs_supers; /* head of the list of this type's super_blocks */
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
struct lock_class_key s_vfs_rename_key;
struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
struct lock_class_key i_lock_key;
struct lock_class_key i_mutex_key;
struct lock_class_key i_mutex_dir_key;
};
- name 表示文件系统名字,比如 ext2、ext4;
- fs_flags 是一些 FS_ 标志位;
- init_fs_context/parameters 和 fs_context 有关系,挂载的时候用到;
- mount 函数指针指向挂载文件系统实例的函数;
- kill_sb 函数指针指向卸载文件系统实例时用于销毁 super_block 的函数,比如 ext4 使用通用的 kill_block_super();
- next 指针用于将文件系统链接成链表;
- fs_supers 是链表头,用于链接文件系统的所有实例;
2.1、register_filesystem()/unregister_filesystem()
在挂载某个文件系统实例前,Linux 必须支持该文件系统。换句话说,需要将该文件系统 file_system_type 注册到 Linux。向 Linux 注册文件系统是通过 register_filesystem() 函数。
Linux 使用链表管理注册的文件系统,所有的文件系统都被链接到一个链表上,全局变量 file_systems 指向该链表的第一个元素。
/// fs/filesystem.c
/* Head of the singly linked list of registered filesystem types (chained via ->next). */
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock); // rwlock guarding the file_systems list
Linux 不允许同一个文件系统重复注册。find_filesystem() 函数可以查找是否已经注册某个文件系统。find_filesystem() 函数返回的是指针的指针,如果存在,*p 指向该文件系统,否则 *p 为 NULL。
从下面的 for 循环可以看到,指针 p 起初指向 file_systems,之后依次指向各个节点的 next 指针,*p 就是该指针的值。如果 *p 为 NULL,表示已经越过链表最后一个节点,即没有找到。
/// fs/filesystem.c
/*
 * Look up a registered filesystem type by name.
 *
 * @name: name to search for; only the first @len bytes are compared
 * @len:  length of the name to match
 *
 * Returns the address of the link pointer (either &file_systems or some
 * node's ->next) that refers to the matching entry. When nothing matches,
 * the returned pointer addresses the terminating NULL link, so a caller can
 * append a new entry simply by storing through it.
 *
 * The function takes no lock itself; the callers shown here hold
 * file_systems_lock around the call.
 */
static struct file_system_type **find_filesystem(const char *name, unsigned len)
{
struct file_system_type **p;
for (p = &file_systems; *p; p = &(*p)->next)
/* Match only if the first len bytes agree AND the registered name ends there. */
if (strncmp((*p)->name, name, len) == 0 &&
!(*p)->name[len])
break;
return p;
}
find_filesystem() 函数的处理手法值得我们学习:如何将查找和插入结合,在查找失败时,可以利用查找结果直接插入元素。
register_filesystem() 函数将一个文件系统注册到全局链表 file_systems 上,如果已经存在,则返回 -EBUSY,表示注册失败。其主要逻辑是调用 find_filesystem() 函数查找链表中是否已经存在待注册文件系统,不存在时才将待注册文件系统添加到链表中。
/// fs/filesystem.c
/*
 * Add a filesystem type to the global file_systems list.
 *
 * @fs: the type to register; fs->next must be NULL on entry
 *
 * Returns 0 on success, -EINVAL when fs->parameters is set but fails
 * fs_validate_description(), and -EBUSY when fs->next is already set or a
 * type with the same name is already on the list.
 */
int register_filesystem(struct file_system_type * fs)
{
int res = 0;
struct file_system_type ** p;
if (fs->parameters &&
!fs_validate_description(fs->name, fs->parameters))
return -EINVAL;
/* A '.' in the name is rejected: get_fs_type() treats it as the subtype separator. */
BUG_ON(strchr(fs->name, '.'));
if (fs->next) // next is not clean: it already points at a file_system_type
return -EBUSY;
write_lock(&file_systems_lock); // take the rwlock for exclusive access
p = find_filesystem(fs->name, strlen(fs->name));
if (*p) // already registered
res = -EBUSY;
else
*p = fs; /* p addresses the tail's NULL link, so this appends fs */
write_unlock(&file_systems_lock); // release the rwlock
return res;
}
unregister_filesystem() 函数是从全局链表 file_systems 中删除某个文件系统。
/// fs/filesystem.c
/*
 * Remove a filesystem type from the global file_systems list.
 *
 * @fs: the type to remove; entries are compared by pointer, not by name
 *
 * Returns 0 when @fs was found and unlinked, -EINVAL when it is not on the
 * list.
 */
int unregister_filesystem(struct file_system_type * fs)
{
struct file_system_type ** tmp;
write_lock(&file_systems_lock); // take the rwlock
tmp = &file_systems;
while (*tmp) {
if (fs == *tmp) {
*tmp = fs->next; // unlink from the list
fs->next = NULL; /* leave fs clean so it could be registered again */
write_unlock(&file_systems_lock);
/* Called after the lock is dropped and before success is reported. */
synchronize_rcu();
return 0;
}
tmp = &(*tmp)->next;
}
write_unlock(&file_systems_lock); // release the lock
return -EINVAL;
}
2.2、get_fs_type()
get_fs_type() 函数根据名字 name 查找是否注册了某个文件系统,如果注册了就返回指向对应 file_system_type 的指针。
/// fs/filesystem.c
/*
 * Find a registered filesystem type by name under the read lock.
 *
 * @name: name to look up
 * @len:  number of bytes of @name that form the type name
 *
 * Returns the matching type, or NULL when none is registered or when
 * try_module_get() on its owner fails.
 */
static struct file_system_type *__get_fs_type(const char *name, int len)
{
struct file_system_type *fs;
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
/* A type whose owning module cannot be pinned is treated as not found. */
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
return fs;
}
/*
 * Look up a filesystem type by name, requesting a module if it is missing.
 *
 * @name: "type" or "type.subtype"; only the part before the first '.' is
 *        used for the lookup
 *
 * Returns the type on success, or NULL when it is not registered (even
 * after the module request) or when a subtype was given but the type does
 * not set FS_HAS_SUBTYPE.
 */
struct file_system_type *get_fs_type(const char *name)
{
struct file_system_type *fs;
const char *dot = strchr(name, '.');
/* Length of the type name proper, excluding any ".subtype" suffix. */
int len = dot ? dot - name : strlen(name);
fs = __get_fs_type(name, len);
/* Not registered: request module "fs-<type>" and, if that succeeds, retry once. */
if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
if (!fs)
pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
len, name);
}
/* A subtype was requested but this type does not support subtypes. */
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
/* presumably drops the reference taken in __get_fs_type() - helper not shown here */
put_filesystem(fs);
fs = NULL;
}
return fs;
}
2.3、ext4_fs_type
ext4 在 module 初始化时调用 register_filesystem() 函数将 ext4_fs_type 注册到 Linux 系统中。ext4_fs_type 定义如下。mount 和 kill_sb 函数指针分别指向 ext4_mount() 和 kill_block_super() 两个函数。
/// fs/ext4/super.c
/*
 * The ext4 filesystem type. Mounting goes through ext4_mount(); kill_sb is
 * the kill_block_super() helper rather than an ext4-specific function.
 */
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
.mount = ext4_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
};
ext4_mount() 函数直接调用 mount_bdev() 函数。
/// fs/ext4/super.c
/*
 * ->mount callback of ext4_fs_type. It forwards all of its arguments to
 * mount_bdev() and supplies ext4_fill_super as the fill-super callback.
 */
static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
const char *dev_name, void *data)
{
return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
}
mount_bdev() 和 kill_block_super() 两个函数都不是 ext4 专有函数,都是通用函数。
3、super_block
vfs 中 super_block 和某个文件系统实例(保存在硬盘)的 super_block 不一样,vfs 中 super_block 记录的是通用信息,不同的文件系统 super_block 记录的信息不一样。Linux 将通用的信息提取出来,使用 super_block 数据结构保存。
super_block 代表一个已经挂载的具体文件系统,保存该文件系统实例的信息。除了文件系统的关键信息(块长度、最大文件长度,等等)之外,超级块还包含了读、写、操作 inode 的函数指针。常用的信息如下所示:
- 依附的物理硬件
- 索引结点 inode 和数据块 block 的位置
- block 的大小(字节)
- 文件系统类型
- 最长文件名
- 最大文件大小
- 根目录的 inode 位置
- 支持的操作
3.1、struct super_block
super_block 结构定义如下,其中的几个对于理解 vfs 架构比较有帮助
- s_list 将所有的 super_block 都链接成双向链表
- s_type 指向所属的 file_system_type
- s_op 指向 super_block 的操作 op,不同的文件系统,op 一般不同
- s_fs_info 指向具体文件系统的私有信息
- s_instances 将同属于一个 file_system_type 的 super_block 链接起来
- s_inodes 保存所有的 inode
- s_inodes_wb 保存需要回写的 inode
/// include/linux/fs.h
/*
 * VFS-level description of one filesystem instance. It holds the
 * information common to all filesystem types; filesystem-specific data is
 * reached through s_fs_info.
 */
struct super_block {
struct list_head s_list; // links every super_block into the global super_blocks list
dev_t s_dev;
unsigned char s_blocksize_bits;
unsigned long s_blocksize; /* block size */
loff_t s_maxbytes; /* maximum file size */
struct file_system_type *s_type; /* owning filesystem type */
const struct super_operations *s_op; /* superblock operations */
const struct dquot_operations *dq_op;
const struct quotactl_ops *s_qcop;
const struct export_operations *s_export_op;
unsigned long s_flags;
unsigned long s_iflags; /* internal SB_I_* flags */
unsigned long s_magic;
struct dentry *s_root; /* root dentry of this filesystem instance */
struct rw_semaphore s_umount;
int s_count; /* reference count */
atomic_t s_active;
#ifdef CONFIG_SECURITY
void *s_security;
#endif
const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
const struct fscrypt_operations *s_cop;
struct key *s_master_keys; /* master crypto keys in use */
#endif
#ifdef CONFIG_FS_VERITY
const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
struct unicode_map *s_encoding;
__u16 s_encoding_flags;
#endif
struct hlist_bl_head s_roots; /* alternate root dentries for NFS */
struct list_head s_mounts; /* list of mounts; _not_ for fs use */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
struct hlist_node s_instances; // node in s_type->fs_supers (instances of the same type)
unsigned int s_quota_types; /* Bitmask of supported quota types */
struct quota_info s_dquot; /* Diskquota specific options */
struct sb_writers s_writers;
/*
* Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
* s_fsnotify_marks together for cache efficiency. They are frequently
* accessed and rarely modified.
*/
void *s_fs_info; /* Filesystem private info */
/* Granularity of c/m/atime in ns (cannot be worse than a second) */
u32 s_time_gran;
/* Time limits for c/m/atime in seconds */
time64_t s_time_min;
time64_t s_time_max;
#ifdef CONFIG_FSNOTIFY
__u32 s_fsnotify_mask;
struct fsnotify_mark_connector __rcu *s_fsnotify_marks;
#endif
char s_id[32]; /* Informational name */
uuid_t s_uuid; /* UUID */
unsigned int s_max_links;
fmode_t s_mode;
/*
* The next field is for VFS *only*. No filesystems have any business
* even looking at it. You had been warned.
*/
struct mutex s_vfs_rename_mutex; /* Kludge */
/*
* Filesystem subtype. If non-empty the filesystem type field
* in /proc/mounts will be "type.subtype"
*/
const char *s_subtype;
const struct dentry_operations *s_d_op; /* default d_op for dentries */
/*
* Saved pool identifier for cleancache (-1 means none)
*/
int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
/* Number of inodes with nlink == 0 but still referenced */
atomic_long_t s_remove_count;
/* Pending fsnotify inode refs */
atomic_long_t s_fsnotify_inode_refs;
/* Being remounted read-only */
int s_readonly_remount;
/* per-sb errseq_t for reporting writeback errors via syncfs */
errseq_t s_wb_err;
/* AIO completions deferred from interrupt context */
struct workqueue_struct *s_dio_done_wq;
struct hlist_head s_pins;
/*
* Owning user namespace and default context in which to
* interpret filesystem uids, gids, quotas, device nodes,
* xattrs and security labels.
*/
struct user_namespace *s_user_ns;
/*
* The list_lru structure is essentially just a pointer to a table
* of per-node lru lists, each of which has its own spinlock.
* There is no need to put them into separate cachelines.
*/
struct list_lru s_dentry_lru; // LRU list of cached dentries
struct list_lru s_inode_lru; // LRU list of cached inodes
struct rcu_head rcu;
struct work_struct destroy_work;
struct mutex s_sync_lock; /* sync serialisation lock */
/*
* Indicates how deep in a filesystem stack this SB is
*/
int s_stack_depth;
/* s_inode_list_lock protects s_inodes */
spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp;
struct list_head s_inodes; /* all inodes of this superblock */
spinlock_t s_inode_wblist_lock;
struct list_head s_inodes_wb; /* writeback inodes */
} __randomize_layout;
Linux 中挂载的 super_block 除了链接到对应的 file_system_type::fs_supers 链表上,还将其链接到全局链表 super_blocks 中。
/// fs/super.c
/* Global list head; every super_block is linked onto it through s_list. */
static LIST_HEAD(super_blocks);
/* NOTE(review): presumably the lock protecting super_blocks - its users are not shown here. */
static DEFINE_SPINLOCK(sb_lock);
3.2、super_operations
super_operations 中定义了超级块支持的操作,是一组函数指针,指向比如 inode 分配、销毁与释放,以及将 inode 数据写回磁盘等函数。
/// include/linux/fs.h
/*
 * Table of callbacks a filesystem supplies for its super_block
 * (super_block::s_op): inode lifetime management, write-back, freezing,
 * statistics, remount and quota I/O.
 */
struct super_operations {
struct inode *(*alloc_inode)(struct super_block *sb); // allocate an inode
void (*destroy_inode)(struct inode *); // destroy an inode
void (*free_inode)(struct inode *); // free an inode
void (*dirty_inode) (struct inode *, int flags);
int (*write_inode) (struct inode *, struct writeback_control *wbc);
int (*drop_inode) (struct inode *);
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
int (*show_options)(struct seq_file *, struct dentry *);
int (*show_devname)(struct seq_file *, struct dentry *);
int (*show_path)(struct seq_file *, struct dentry *);
int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
struct dquot **(*get_dquots)(struct inode *);
#endif
int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
long (*nr_cached_objects)(struct super_block *,
struct shrink_control *);
long (*free_cached_objects)(struct super_block *,
struct shrink_control *);
};
3.3、ext4_sops
比如 ext4 文件系统,实现的 super_operations 如下 ext4_sops 所示。
/// fs/ext4/super.c
/*
 * ext4's super_operations table. Callbacks not listed here (for example
 * freeze_super, thaw_super, umount_begin) are left NULL.
 */
static const struct super_operations ext4_sops = {
.alloc_inode = ext4_alloc_inode,
.free_inode = ext4_free_in_core_inode,
.destroy_inode = ext4_destroy_inode,
.write_inode = ext4_write_inode,
.dirty_inode = ext4_dirty_inode,
.drop_inode = ext4_drop_inode,
.evict_inode = ext4_evict_inode,
.put_super = ext4_put_super,
.sync_fs = ext4_sync_fs,
.freeze_fs = ext4_freeze,
.unfreeze_fs = ext4_unfreeze,
.statfs = ext4_statfs,
.remount_fs = ext4_remount,
.show_options = ext4_show_options,
#ifdef CONFIG_QUOTA
.quota_read = ext4_quota_read,
.quota_write = ext4_quota_write,
.get_dquots = ext4_get_dquots,
#endif
.bdev_try_to_free_page = bdev_try_to_free_page,
};
4、inode
要访问一个文件,一定要通过它的 inode 才能知道这个文件是什么类型的文件、是怎么组织的、文件中存储着多少数据、这些数据在什么地方以及其下层的驱动程序在哪等必要的信息。
每个 inode 都有一个唯一的整数,在每个文件系统实例里 inode 编号唯一。索引结点 inode 包含了内核在操作文件或目录时(目录也被当作文件看待)需要的全部信息。对于 UNIX 风格的文件系统,这些信息可以根据需要从磁盘索引结点直接读入或者写回磁盘。磁盘上的一个索引结点代表一个文件,内核中一个 inode 代表打开的一个文件。
4.1、struct inode
大部分成员用于管理简单的状态信息。例如 i_atime、i_mtime、i_ctime 分别存储了最后访问的时间、最后修改文件内容的时间、最后修改 inode 的时间。
文件访问权限和所有权保存在 i_mode(文件类型和访问权限)、i_uid 和 i_gid(与该文件相关的 UID 和 GID)中。
文件长度保存在 i_size,按字节计算。
文件和目录都有一个 inode,但是 i_op 指向的 inode_operations 实现却是不同。
/// include/linux/fs.h
/*
 * In-kernel representation of a file or directory: type and permissions,
 * ownership, size, timestamps, and the operation tables (i_op, i_fop) used
 * to act on it.
 */
struct inode {
umode_t i_mode; /* file type and access permissions */
unsigned short i_opflags;
kuid_t i_uid; /* owner user id */
kgid_t i_gid; /* owner group id */
unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
const struct inode_operations *i_op; // inode operations
struct super_block *i_sb; // owning super_block
struct address_space *i_mapping;
#ifdef CONFIG_SECURITY
void *i_security;
#endif
unsigned long i_ino; // inode number
/* The same storage under two names: a const view and a writable one. */
union {
const unsigned int i_nlink; // link count (read-only view)
unsigned int __i_nlink;
};
dev_t i_rdev; // associated device number
loff_t i_size; // file size
struct timespec64 i_atime; // last access time
struct timespec64 i_mtime; // last modification time
struct timespec64 i_ctime; // last inode change time
spinlock_t i_lock;
unsigned short i_bytes;
u8 i_blkbits;
u8 i_write_hint;
blkcnt_t i_blocks; // file length as a number of blocks
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
/* Misc */
unsigned long i_state;
struct rw_semaphore i_rwsem;
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;
struct hlist_node i_hash;
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
/* foreign inode detection, see wbc_detach_inode() */
int i_wb_frn_winner;
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list;
struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
atomic64_t i_version;
atomic64_t i_sequence; /* see futex */
atomic_t i_count;
atomic_t i_dio_count;
atomic_t i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
atomic_t i_readcount; /* struct files open RO */
#endif
union {
const struct file_operations *i_fop; // file operations
void (*free_inode)(struct inode *);
};
struct file_lock_context *i_flctx;
struct address_space i_data;
struct list_head i_devices;
/* Only one of these is meaningful, depending on what the inode represents. */
union {
struct pipe_inode_info *i_pipe; // pipe
struct block_device *i_bdev; // block device
struct cdev *i_cdev; // character device
char *i_link; // link
unsigned i_dir_seq;
};
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_info *i_crypt_info;
#endif
#ifdef CONFIG_FS_VERITY
struct fsverity_info *i_verity_info;
#endif
void *i_private; /* fs or device private pointer */
} __randomize_layout;
4.2、inode_operations
大多数情况下,各个函数指针成员的语义可以根据其名称推断。它们与对应的系统调用和用户空间工具在名称上非常相似。例如,rmdir 删除目录,rename 重命名文件系统对象,等等。
尽管如此,并非所有名称都有熟悉的命令与之对应。
/// include/linux/fs.h
/*
 * Table of callbacks that act on an inode (inode::i_op): name lookup,
 * creating and removing directory entries, attributes, ACLs and so on.
 * A filesystem fills in only the callbacks that make sense for a given kind
 * of inode; compare the ext4 file, directory and special tables.
 */
struct inode_operations {
struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
int (*permission) (struct inode *, int);
struct posix_acl * (*get_acl)(struct inode *, int);
int (*readlink) (struct dentry *, char __user *,int);
int (*create) (struct inode *,struct dentry *, umode_t, bool);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*setattr) (struct dentry *, struct iattr *);
int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
ssize_t (*listxattr) (struct dentry *, char *, size_t);
int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
u64 len);
int (*update_time)(struct inode *, struct timespec64 *, int);
int (*atomic_open)(struct inode *, struct dentry *,
struct file *, unsigned open_flag,
umode_t create_mode);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;
普通文件和目录文件的 inode_operations 不一样(Linux 将目录也当作文件)。比如 ext4 普通文件对应的 inode_operations 定义如下:
/// fs/ext4/file.c
/*
 * inode_operations for ext4 regular files: only attribute, xattr-listing,
 * ACL and fiemap callbacks are set; none of the directory-entry callbacks
 * (create, lookup, link, ...) are.
 */
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_file_getattr,
.listxattr = ext4_listxattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
而目录文件的 inode_operations 支持的函数比普通文件多,定义如下:
/// fs/ext4/namei.c
/*
 * inode_operations for ext4 directories. In addition to the attribute and
 * ACL callbacks shared with regular files, it sets the callbacks that
 * create, look up, link, rename and remove directory entries.
 */
const struct inode_operations ext4_dir_inode_operations = {
.create = ext4_create,
.lookup = ext4_lookup,
.link = ext4_link,
.unlink = ext4_unlink,
.symlink = ext4_symlink,
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename2,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
.fiemap = ext4_fiemap,
};
某些特殊文件的 inode_operations 定义如下
/// fs/ext4/namei.c
/*
 * inode_operations for ext4 special files: attribute, xattr-listing and
 * ACL callbacks only (no fiemap, no directory-entry callbacks).
 */
const struct inode_operations ext4_special_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
.listxattr = ext4_listxattr,
.get_acl = ext4_get_acl,
.set_acl = ext4_set_acl,
};
4.3、file_operations
文件不能只存储信息,必须容许操作其中的信息。从用户的角度来看,文件操作由标准库的函数执行。这些函数指示内核执行系统调用,然后系统调用执行所需的操作。
当然各个文件系统的实现接口不同,因而用于抽象文件操作的结构必须尽可能通用,以考虑到各种各样的目标文件。同时,它不能带有过多只适用于特定文件类型的专门操作。
各个 inode 实例都包含一个指向 file_operations 实例的指针 i_fop,该结构保存了指向所有可能文件操作的函数指针。该结构定义如下:
/// include/linux/fs.h
/*
 * Table of callbacks that act on an open file (inode::i_fop, file::f_op):
 * seeking, reading and writing, directory iteration, polling, ioctl, mmap,
 * open/release, syncing, locking, splicing and range operations.
 */
struct file_operations {
struct module *owner;
loff_t (*llseek) (struct file *, loff_t, int);
/* read/write and their iterator-based variants */
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iopoll)(struct kiocb *kiocb, bool spin);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
int (*fasync) (int, struct file *, int);
int (*lock) (struct file *, int, struct file_lock *);
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;
- read/write/read_iter/write_iter 是读写操作函数
- poll 是文件系统 poll 机制的实现
- mmap 是为了支持内存映射
普通文件和目录文件对应的 file_operations 是不同的,比如 ext4 文件系统普通文件的 file_operations 定义如下
/// fs/ext4/file.c
/*
 * file_operations for ext4 regular files. Data I/O is provided through
 * read_iter/write_iter only (the plain read/write callbacks are not set);
 * several entries point at generic helpers rather than ext4 functions.
 */
const struct file_operations ext4_file_operations = {
.llseek = ext4_llseek,
.read_iter = ext4_file_read_iter,
.write_iter = ext4_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open,
.release = ext4_release_file,
.fsync = ext4_sync_file,
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.fallocate = ext4_fallocate,
};
而目录的文件操作
/// fs/ext4/dir.c
/*
 * file_operations for ext4 directories. Directory contents are produced by
 * the iterate_shared callback (ext4_readdir); read is wired to
 * generic_read_dir, and no write or mmap callbacks are set.
 */
const struct file_operations ext4_dir_operations = {
.llseek = ext4_dir_llseek,
.read = generic_read_dir,
.iterate_shared = ext4_readdir,
.unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext4_compat_ioctl,
#endif
.fsync = ext4_sync_file,
.open = ext4_dir_open,
.release = ext4_release_dir,
};
5、dentry
Linux 系统将目录也当作一个文件,文件内容是文件名或者目录名。
5.1、struct dentry
struct dentry是一种标准化的数据结构,可以表示文件名或目录。它还建立了文件名及其inode之间的关联
/// include/linux/dcache.h
/*
 * A directory entry: associates a name (d_name) with an inode (d_inode) and
 * links the entry to its parent and children.
 */
struct dentry {
/* RCU lookup touched fields */
unsigned int d_flags; /* protected by d_lock */
seqcount_spinlock_t d_seq; /* per dentry seqlock */
struct hlist_bl_node d_hash; // hash-list node, used for lookup
struct dentry *d_parent; // parent directory
struct qstr d_name;
struct inode *d_inode; // associated inode
unsigned char d_iname[DNAME_INLINE_LEN]; // inline storage for short names
/* Ref lookup also touches following */
struct lockref d_lockref; /* per-dentry lock and refcount */
const struct dentry_operations *d_op; // dentry operations
struct super_block *d_sb; /* The root of the dentry tree */
unsigned long d_time; /* used by d_revalidate */
void *d_fsdata; /* fs-specific data */
union {
struct list_head d_lru; // dentry LRU list linkage
wait_queue_head_t *d_wait; /* in-lookup ones only */
};
struct list_head d_child; /* child of parent list */
struct list_head d_subdirs; /* our children */
/*
* d_alias and d_rcu can share memory
*/
union {
struct hlist_node d_alias; /* inode alias list */
struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */
struct rcu_head d_rcu;
} d_u;
} __randomize_layout;
5.2、dentry_operations
/// include/linux/dcache.h
/*
 * Table of callbacks a filesystem can supply for its dentries
 * (dentry::d_op; super_block::s_d_op holds the per-superblock default).
 */
struct dentry_operations {
int (*d_revalidate)(struct dentry *, unsigned int);
int (*d_weak_revalidate)(struct dentry *, unsigned int);
int (*d_hash)(const struct dentry *, struct qstr *);
int (*d_compare)(const struct dentry *,
unsigned int, const char *, const struct qstr *);
int (*d_delete)(const struct dentry *);
int (*d_init)(struct dentry *);
void (*d_release)(struct dentry *);
void (*d_prune)(struct dentry *);
void (*d_iput)(struct dentry *, struct inode *);
char *(*d_dname)(struct dentry *, char *, int);
struct vfsmount *(*d_automount)(struct path *);
int (*d_manage)(const struct path *, bool);
struct dentry *(*d_real)(struct dentry *, const struct inode *);
} ____cacheline_aligned;
5.3、ext4_dentry_ops
/// fs/ext4/dir.c
/*
 * ext4's dentry_operations: only name hashing and comparison are overridden,
 * both with generic_ci_* helpers.
 * NOTE(review): the "ci" helpers presumably implement case-insensitive
 * names - confirm against their definitions, which are not shown here.
 */
const struct dentry_operations ext4_dentry_ops = {
.d_hash = generic_ci_d_hash,
.d_compare = generic_ci_d_compare,
};
6、file
从进程的角度,标识打开的文件。主要维持如下信息。文件描述符(就是整数)用于在一个进程内唯一地标识打开的文件。该整数其实是数组的下标,数组是指针数组,每个数组项指向一个 file 结构实例,管理一个打开文件的所有信息。
struct file 保存了内核所看到的文件的特殊信息,比如
- 文件当前的读写位置
- 打开文件的权限
- 指向 inode 的指针
struct file 定义如下:
/// include/linux/fs.h
/*
 * An open file as seen by a process: which inode it refers to, the
 * operations table in use, the open flags/mode and the current position.
 */
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path; // path of the file (ties the name to the inode)
struct inode *f_inode; // cached inode
const struct file_operations *f_op;
/*
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count;
unsigned int f_flags;
fmode_t f_mode; // mode the file was opened with
struct mutex f_pos_lock;
loff_t f_pos; // current read/write position
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
errseq_t f_sb_err; /* for syncfs */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
f_op 缓存的是 inode 的 i_fop 文件操作函数。
7、fs_context
fs_context 是 file_system_type 和 super_block 之间的桥梁,创建和配置 super_block 都离不开 fs_context,主要在 mount 调用时使用。
7.1、fs_context_operations
/// include/linux/fs_context.h
/*
 * Callbacks a filesystem attaches to an fs_context (fs_context::ops):
 * freeing and duplicating the context, parsing parameters, obtaining the
 * tree and reconfiguring.
 */
struct fs_context_operations {
void (*free)(struct fs_context *fc); /* called when fs_context::need_free is set */
int (*dup)(struct fs_context *fc, struct fs_context *src_fc);
int (*parse_param)(struct fs_context *fc, struct fs_parameter *param);
int (*parse_monolithic)(struct fs_context *fc, void *data);
int (*get_tree)(struct fs_context *fc);
int (*reconfigure)(struct fs_context *fc);
};
7.2、fs_context
/// include/linux/fs_context.h
/*
 * State carried while a superblock is being created or reconfigured: the
 * filesystem type, the proposed superblock settings, namespaces and
 * credentials of the mounter, and the resulting root dentry.
 */
struct fs_context {
const struct fs_context_operations *ops;
struct mutex uapi_mutex; /* Userspace access mutex */
struct file_system_type *fs_type;
void *fs_private; /* The filesystem's context */
void *sget_key;
struct dentry *root; /* The root and superblock */
struct user_namespace *user_ns; /* The user namespace for this mount */
struct net *net_ns; /* The network namespace for this mount */
const struct cred *cred; /* The mounter's credentials */
struct p_log log; /* Logging buffer */
const char *source; /* The source name (eg. dev path) */
void *security; /* Linux S&M options */
void *s_fs_info; /* Proposed s_fs_info */
unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
unsigned int sb_flags_mask; /* Superblock flags that were changed */
unsigned int s_iflags; /* OR'd with sb->s_iflags */
unsigned int lsm_flags; /* Information flags from the fs to the LSM */
enum fs_context_purpose purpose:8;
enum fs_context_phase phase:8; /* The phase the context is in */
bool need_free:1; /* Need to call ops->free() */
bool global:1; /* Goes into &init_user_ns */
bool oldapi:1; /* Coming from mount(2) */
};
接下来会分析 mount/read/write 调用的执行过程。
收录于合集 #linux
8个
下一篇学习 Linux | 虚拟文件系统(二)文件系统 mount 以及文件创建 create
喜欢此内容的人还喜欢
学习 Linux | 内存管理(一)干货满满,一文搞懂 Slab 分配器
源知源为
不喜欢
不看的原因
确定
- 内容质量低
- 不看此公众号
学习 Linux | 虚拟文件系统(五)IO 复用 poll 改进了什么
源知源为
不喜欢
不看的原因
确定
- 内容质量低
- 不看此公众号