Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: include/linux/kernel.h
This commit is contained in:
@@ -99,7 +99,7 @@ prototypes:
|
||||
int (*sync_fs)(struct super_block *sb, int wait);
|
||||
void (*write_super_lockfs) (struct super_block *);
|
||||
void (*unlockfs) (struct super_block *);
|
||||
int (*statfs) (struct super_block *, struct kstatfs *);
|
||||
int (*statfs) (struct dentry *, struct kstatfs *);
|
||||
int (*remount_fs) (struct super_block *, int *, char *);
|
||||
void (*clear_inode) (struct inode *);
|
||||
void (*umount_begin) (struct super_block *);
|
||||
@@ -142,15 +142,16 @@ see also dquot_operations section.
|
||||
|
||||
--------------------------- file_system_type ---------------------------
|
||||
prototypes:
|
||||
struct super_block *(*get_sb) (struct file_system_type *, int,
|
||||
const char *, void *);
|
||||
struct int (*get_sb) (struct file_system_type *, int,
|
||||
const char *, void *, struct vfsmount *);
|
||||
void (*kill_sb) (struct super_block *);
|
||||
locking rules:
|
||||
may block BKL
|
||||
get_sb yes yes
|
||||
kill_sb yes yes
|
||||
|
||||
->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
|
||||
->get_sb() returns error or 0 with locked superblock attached to the vfsmount
|
||||
(exclusive on ->s_umount).
|
||||
->kill_sb() takes a write-locked superblock, does all shutdown work on it,
|
||||
unlocks and drops the reference.
|
||||
|
||||
|
@@ -19,7 +19,7 @@ following procedure:
|
||||
|
||||
(2) Have the follow_link() op do the following steps:
|
||||
|
||||
(a) Call do_kern_mount() to call the appropriate filesystem to set up a
|
||||
(a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
|
||||
superblock and gain a vfsmount structure representing it.
|
||||
|
||||
(b) Copy the nameidata provided as an argument and substitute the dentry
|
||||
|
@@ -264,6 +264,15 @@ static struct config_item_type simple_child_type = {
|
||||
};
|
||||
|
||||
|
||||
struct simple_children {
|
||||
struct config_group group;
|
||||
};
|
||||
|
||||
static inline struct simple_children *to_simple_children(struct config_item *item)
|
||||
{
|
||||
return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
|
||||
}
|
||||
|
||||
static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
|
||||
{
|
||||
struct simple_child *simple_child;
|
||||
@@ -304,7 +313,13 @@ static ssize_t simple_children_attr_show(struct config_item *item,
|
||||
"items have only one attribute that is readable and writeable.\n");
|
||||
}
|
||||
|
||||
static void simple_children_release(struct config_item *item)
|
||||
{
|
||||
kfree(to_simple_children(item));
|
||||
}
|
||||
|
||||
static struct configfs_item_operations simple_children_item_ops = {
|
||||
.release = simple_children_release,
|
||||
.show_attribute = simple_children_attr_show,
|
||||
};
|
||||
|
||||
@@ -345,10 +360,6 @@ static struct configfs_subsystem simple_children_subsys = {
|
||||
* children of its own.
|
||||
*/
|
||||
|
||||
struct simple_children {
|
||||
struct config_group group;
|
||||
};
|
||||
|
||||
static struct config_group *group_children_make_group(struct config_group *group, const char *name)
|
||||
{
|
||||
struct simple_children *simple_children;
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,40 +0,0 @@
|
||||
Device File System (devfs) ToDo List
|
||||
|
||||
Richard Gooch <rgooch@atnf.csiro.au>
|
||||
|
||||
3-JUL-2000
|
||||
|
||||
This is a list of things to be done for better devfs support in the
|
||||
Linux kernel. If you'd like to contribute to the devfs, please have a
|
||||
look at this list for anything that is unallocated. Also, if there are
|
||||
items missing (surely), please contact me so I can add them to the
|
||||
list (preferably with your name attached to them:-).
|
||||
|
||||
|
||||
- >256 ptys
|
||||
Thanks to C. Scott Ananian <cananian@alumni.princeton.edu>
|
||||
|
||||
- Amiga floppy driver (drivers/block/amiflop.c)
|
||||
|
||||
- Atari floppy driver (drivers/block/ataflop.c)
|
||||
|
||||
- SWIM3 (Super Woz Integrated Machine 3) floppy driver (drivers/block/swim3.c)
|
||||
|
||||
- Amiga ZorroII ramdisc driver (drivers/block/z2ram.c)
|
||||
|
||||
- Parallel port ATAPI CD-ROM (drivers/block/paride/pcd.c)
|
||||
|
||||
- Parallel port ATAPI floppy (drivers/block/paride/pf.c)
|
||||
|
||||
- AP1000 block driver (drivers/ap1000/ap.c, drivers/ap1000/ddv.c)
|
||||
|
||||
- Archimedes floppy (drivers/acorn/block/fd1772.c)
|
||||
|
||||
- MFM hard drive (drivers/acorn/block/mfmhd.c)
|
||||
|
||||
- I2O block device (drivers/message/i2o/i2o_block.c)
|
||||
|
||||
- ST-RAM device (arch/m68k/atari/stram.c)
|
||||
|
||||
- Raw devices
|
||||
|
@@ -1,65 +0,0 @@
|
||||
/* -*- auto-fill -*- */
|
||||
|
||||
Device File System (devfs) Boot Options
|
||||
|
||||
Richard Gooch <rgooch@atnf.csiro.au>
|
||||
|
||||
18-AUG-2001
|
||||
|
||||
|
||||
When CONFIG_DEVFS_DEBUG is enabled, you can pass several boot options
|
||||
to the kernel to debug devfs. The boot options are prefixed by
|
||||
"devfs=", and are separated by commas. Spaces are not allowed. The
|
||||
syntax looks like this:
|
||||
|
||||
devfs=<option1>,<option2>,<option3>
|
||||
|
||||
and so on. For example, if you wanted to turn on debugging for module
|
||||
load requests and device registration, you would do:
|
||||
|
||||
devfs=dmod,dreg
|
||||
|
||||
You may prefix "no" to any option. This will invert the option.
|
||||
|
||||
|
||||
Debugging Options
|
||||
=================
|
||||
|
||||
These requires CONFIG_DEVFS_DEBUG to be enabled.
|
||||
Note that all debugging options have 'd' as the first character. By
|
||||
default all options are off. All debugging output is sent to the
|
||||
kernel logs. The debugging options do not take effect until the devfs
|
||||
version message appears (just prior to the root filesystem being
|
||||
mounted).
|
||||
|
||||
These are the options:
|
||||
|
||||
dmod print module load requests to <request_module>
|
||||
|
||||
dreg print device register requests to <devfs_register>
|
||||
|
||||
dunreg print device unregister requests to <devfs_unregister>
|
||||
|
||||
dchange print device change requests to <devfs_set_flags>
|
||||
|
||||
dilookup print inode lookup requests
|
||||
|
||||
diget print VFS inode allocations
|
||||
|
||||
diunlink print inode unlinks
|
||||
|
||||
dichange print inode changes
|
||||
|
||||
dimknod print calls to mknod(2)
|
||||
|
||||
dall some debugging turned on
|
||||
|
||||
|
||||
Other Options
|
||||
=============
|
||||
|
||||
These control the default behaviour of devfs. The options are:
|
||||
|
||||
mount mount devfs onto /dev at boot time
|
||||
|
||||
only disable non-devfs device nodes for devfs-capable drivers
|
@@ -113,6 +113,14 @@ noquota
|
||||
grpquota
|
||||
usrquota
|
||||
|
||||
bh (*) ext3 associates buffer heads to data pages to
|
||||
nobh (a) cache disk block mapping information
|
||||
(b) link pages into transaction to provide
|
||||
ordering guarantees.
|
||||
"bh" option forces use of buffer heads.
|
||||
"nobh" option tries to avoid associating buffer
|
||||
heads (supported only for "writeback" mode).
|
||||
|
||||
|
||||
Specification
|
||||
=============
|
||||
|
@@ -18,6 +18,14 @@ Non-privileged mount (or user mount):
|
||||
user. NOTE: this is not the same as mounts allowed with the "user"
|
||||
option in /etc/fstab, which is not discussed here.
|
||||
|
||||
Filesystem connection:
|
||||
|
||||
A connection between the filesystem daemon and the kernel. The
|
||||
connection exists until either the daemon dies, or the filesystem is
|
||||
umounted. Note that detaching (or lazy umounting) the filesystem
|
||||
does _not_ break the connection, in this case it will exist until
|
||||
the last reference to the filesystem is released.
|
||||
|
||||
Mount owner:
|
||||
|
||||
The user who does the mounting.
|
||||
@@ -86,16 +94,20 @@ Mount options
|
||||
The default is infinite. Note that the size of read requests is
|
||||
limited anyway to 32 pages (which is 128kbyte on i386).
|
||||
|
||||
Sysfs
|
||||
~~~~~
|
||||
Control filesystem
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
FUSE sets up the following hierarchy in sysfs:
|
||||
There's a control filesystem for FUSE, which can be mounted by:
|
||||
|
||||
/sys/fs/fuse/connections/N/
|
||||
mount -t fusectl none /sys/fs/fuse/connections
|
||||
|
||||
where N is an increasing number allocated to each new connection.
|
||||
Mounting it under the '/sys/fs/fuse/connections' directory makes it
|
||||
backwards compatible with earlier versions.
|
||||
|
||||
For each connection the following attributes are defined:
|
||||
Under the fuse control filesystem each connection has a directory
|
||||
named by a unique number.
|
||||
|
||||
For each connection the following files exist within this directory:
|
||||
|
||||
'waiting'
|
||||
|
||||
@@ -110,7 +122,47 @@ For each connection the following attributes are defined:
|
||||
connection. This means that all waiting requests will be aborted an
|
||||
error returned for all aborted and new requests.
|
||||
|
||||
Only a privileged user may read or write these attributes.
|
||||
Only the owner of the mount may read or write these files.
|
||||
|
||||
Interrupting filesystem operations
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
If a process issuing a FUSE filesystem request is interrupted, the
|
||||
following will happen:
|
||||
|
||||
1) If the request is not yet sent to userspace AND the signal is
|
||||
fatal (SIGKILL or unhandled fatal signal), then the request is
|
||||
dequeued and returns immediately.
|
||||
|
||||
2) If the request is not yet sent to userspace AND the signal is not
|
||||
fatal, then an 'interrupted' flag is set for the request. When
|
||||
the request has been successfully transfered to userspace and
|
||||
this flag is set, an INTERRUPT request is queued.
|
||||
|
||||
3) If the request is already sent to userspace, then an INTERRUPT
|
||||
request is queued.
|
||||
|
||||
INTERRUPT requests take precedence over other requests, so the
|
||||
userspace filesystem will receive queued INTERRUPTs before any others.
|
||||
|
||||
The userspace filesystem may ignore the INTERRUPT requests entirely,
|
||||
or may honor them by sending a reply to the _original_ request, with
|
||||
the error set to EINTR.
|
||||
|
||||
It is also possible that there's a race between processing the
|
||||
original request and it's INTERRUPT request. There are two possibilities:
|
||||
|
||||
1) The INTERRUPT request is processed before the original request is
|
||||
processed
|
||||
|
||||
2) The INTERRUPT request is processed after the original request has
|
||||
been answered
|
||||
|
||||
If the filesystem cannot find the original request, it should wait for
|
||||
some timeout and/or a number of new requests to arrive, after which it
|
||||
should reply to the INTERRUPT request with an EAGAIN error. In case
|
||||
1) the INTERRUPT request will be requeued. In case 2) the INTERRUPT
|
||||
reply will be ignored.
|
||||
|
||||
Aborting a filesystem connection
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@@ -139,8 +191,8 @@ the filesystem. There are several ways to do this:
|
||||
- Use forced umount (umount -f). Works in all cases but only if
|
||||
filesystem is still attached (it hasn't been lazy unmounted)
|
||||
|
||||
- Abort filesystem through the sysfs interface. Most powerful
|
||||
method, always works.
|
||||
- Abort filesystem through the FUSE control filesystem. Most
|
||||
powerful method, always works.
|
||||
|
||||
How do non-privileged mounts work?
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
@@ -304,25 +356,7 @@ Scenario 1 - Simple deadlock
|
||||
| | for "file"]
|
||||
| | *DEADLOCK*
|
||||
|
||||
The solution for this is to allow requests to be interrupted while
|
||||
they are in userspace:
|
||||
|
||||
| [interrupted by signal] |
|
||||
| <fuse_unlink() |
|
||||
| [release semaphore] | [semaphore acquired]
|
||||
| <sys_unlink() |
|
||||
| | >fuse_unlink()
|
||||
| | [queue req on fc->pending]
|
||||
| | [wake up fc->waitq]
|
||||
| | [sleep on req->waitq]
|
||||
|
||||
If the filesystem daemon was single threaded, this will stop here,
|
||||
since there's no other thread to dequeue and execute the request.
|
||||
In this case the solution is to kill the FUSE daemon as well. If
|
||||
there are multiple serving threads, you just have to kill them as
|
||||
long as any remain.
|
||||
|
||||
Moral: a filesystem which deadlocks, can soon find itself dead.
|
||||
The solution for this is to allow the filesystem to be aborted.
|
||||
|
||||
Scenario 2 - Tricky deadlock
|
||||
----------------------------
|
||||
@@ -355,24 +389,14 @@ but is caused by a pagefault.
|
||||
| | [lock page]
|
||||
| | * DEADLOCK *
|
||||
|
||||
Solution is again to let the the request be interrupted (not
|
||||
elaborated further).
|
||||
Solution is basically the same as above.
|
||||
|
||||
An additional problem is that while the write buffer is being
|
||||
copied to the request, the request must not be interrupted. This
|
||||
is because the destination address of the copy may not be valid
|
||||
after the request is interrupted.
|
||||
An additional problem is that while the write buffer is being copied
|
||||
to the request, the request must not be interrupted/aborted. This is
|
||||
because the destination address of the copy may not be valid after the
|
||||
request has returned.
|
||||
|
||||
This is solved with doing the copy atomically, and allowing
|
||||
interruption while the page(s) belonging to the write buffer are
|
||||
faulted with get_user_pages(). The 'req->locked' flag indicates
|
||||
when the copy is taking place, and interruption is delayed until
|
||||
this flag is unset.
|
||||
|
||||
Scenario 3 - Tricky deadlock with asynchronous read
|
||||
---------------------------------------------------
|
||||
|
||||
The same situation as above, except thread-1 will wait on page lock
|
||||
and hence it will be uninterruptible as well. The solution is to
|
||||
abort the connection with forced umount (if mount is attached) or
|
||||
through the abort attribute in sysfs.
|
||||
This is solved with doing the copy atomically, and allowing abort
|
||||
while the page(s) belonging to the write buffer are faulted with
|
||||
get_user_pages(). The 'req->locked' flag indicates when the copy is
|
||||
taking place, and abort is delayed until this flag is unset.
|
||||
|
@@ -69,17 +69,135 @@ Prototypes:
|
||||
int inotify_rm_watch (int fd, __u32 mask);
|
||||
|
||||
|
||||
(iii) Internal Kernel Implementation
|
||||
(iii) Kernel Interface
|
||||
|
||||
Each inotify instance is associated with an inotify_device structure.
|
||||
Inotify's kernel API consists a set of functions for managing watches and an
|
||||
event callback.
|
||||
|
||||
To use the kernel API, you must first initialize an inotify instance with a set
|
||||
of inotify_operations. You are given an opaque inotify_handle, which you use
|
||||
for any further calls to inotify.
|
||||
|
||||
struct inotify_handle *ih = inotify_init(my_event_handler);
|
||||
|
||||
You must provide a function for processing events and a function for destroying
|
||||
the inotify watch.
|
||||
|
||||
void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
|
||||
u32 cookie, const char *name, struct inode *inode)
|
||||
|
||||
watch - the pointer to the inotify_watch that triggered this call
|
||||
wd - the watch descriptor
|
||||
mask - describes the event that occurred
|
||||
cookie - an identifier for synchronizing events
|
||||
name - the dentry name for affected files in a directory-based event
|
||||
inode - the affected inode in a directory-based event
|
||||
|
||||
void destroy_watch(struct inotify_watch *watch)
|
||||
|
||||
You may add watches by providing a pre-allocated and initialized inotify_watch
|
||||
structure and specifying the inode to watch along with an inotify event mask.
|
||||
You must pin the inode during the call. You will likely wish to embed the
|
||||
inotify_watch structure in a structure of your own which contains other
|
||||
information about the watch. Once you add an inotify watch, it is immediately
|
||||
subject to removal depending on filesystem events. You must grab a reference if
|
||||
you depend on the watch hanging around after the call.
|
||||
|
||||
inotify_init_watch(&my_watch->iwatch);
|
||||
inotify_get_watch(&my_watch->iwatch); // optional
|
||||
s32 wd = inotify_add_watch(ih, &my_watch->iwatch, inode, mask);
|
||||
inotify_put_watch(&my_watch->iwatch); // optional
|
||||
|
||||
You may use the watch descriptor (wd) or the address of the inotify_watch for
|
||||
other inotify operations. You must not directly read or manipulate data in the
|
||||
inotify_watch. Additionally, you must not call inotify_add_watch() more than
|
||||
once for a given inotify_watch structure, unless you have first called either
|
||||
inotify_rm_watch() or inotify_rm_wd().
|
||||
|
||||
To determine if you have already registered a watch for a given inode, you may
|
||||
call inotify_find_watch(), which gives you both the wd and the watch pointer for
|
||||
the inotify_watch, or an error if the watch does not exist.
|
||||
|
||||
wd = inotify_find_watch(ih, inode, &watchp);
|
||||
|
||||
You may use container_of() on the watch pointer to access your own data
|
||||
associated with a given watch. When an existing watch is found,
|
||||
inotify_find_watch() bumps the refcount before releasing its locks. You must
|
||||
put that reference with:
|
||||
|
||||
put_inotify_watch(watchp);
|
||||
|
||||
Call inotify_find_update_watch() to update the event mask for an existing watch.
|
||||
inotify_find_update_watch() returns the wd of the updated watch, or an error if
|
||||
the watch does not exist.
|
||||
|
||||
wd = inotify_find_update_watch(ih, inode, mask);
|
||||
|
||||
An existing watch may be removed by calling either inotify_rm_watch() or
|
||||
inotify_rm_wd().
|
||||
|
||||
int ret = inotify_rm_watch(ih, &my_watch->iwatch);
|
||||
int ret = inotify_rm_wd(ih, wd);
|
||||
|
||||
A watch may be removed while executing your event handler with the following:
|
||||
|
||||
inotify_remove_watch_locked(ih, iwatch);
|
||||
|
||||
Call inotify_destroy() to remove all watches from your inotify instance and
|
||||
release it. If there are no outstanding references, inotify_destroy() will call
|
||||
your destroy_watch op for each watch.
|
||||
|
||||
inotify_destroy(ih);
|
||||
|
||||
When inotify removes a watch, it sends an IN_IGNORED event to your callback.
|
||||
You may use this event as an indication to free the watch memory. Note that
|
||||
inotify may remove a watch due to filesystem events, as well as by your request.
|
||||
If you use IN_ONESHOT, inotify will remove the watch after the first event, at
|
||||
which point you may call the final inotify_put_watch.
|
||||
|
||||
(iv) Kernel Interface Prototypes
|
||||
|
||||
struct inotify_handle *inotify_init(struct inotify_operations *ops);
|
||||
|
||||
inotify_init_watch(struct inotify_watch *watch);
|
||||
|
||||
s32 inotify_add_watch(struct inotify_handle *ih,
|
||||
struct inotify_watch *watch,
|
||||
struct inode *inode, u32 mask);
|
||||
|
||||
s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
|
||||
struct inotify_watch **watchp);
|
||||
|
||||
s32 inotify_find_update_watch(struct inotify_handle *ih,
|
||||
struct inode *inode, u32 mask);
|
||||
|
||||
int inotify_rm_wd(struct inotify_handle *ih, u32 wd);
|
||||
|
||||
int inotify_rm_watch(struct inotify_handle *ih,
|
||||
struct inotify_watch *watch);
|
||||
|
||||
void inotify_remove_watch_locked(struct inotify_handle *ih,
|
||||
struct inotify_watch *watch);
|
||||
|
||||
void inotify_destroy(struct inotify_handle *ih);
|
||||
|
||||
void get_inotify_watch(struct inotify_watch *watch);
|
||||
void put_inotify_watch(struct inotify_watch *watch);
|
||||
|
||||
|
||||
(v) Internal Kernel Implementation
|
||||
|
||||
Each inotify instance is represented by an inotify_handle structure.
|
||||
Inotify's userspace consumers also have an inotify_device which is
|
||||
associated with the inotify_handle, and on which events are queued.
|
||||
|
||||
Each watch is associated with an inotify_watch structure. Watches are chained
|
||||
off of each associated device and each associated inode.
|
||||
off of each associated inotify_handle and each associated inode.
|
||||
|
||||
See fs/inotify.c for the locking and lifetime rules.
|
||||
See fs/inotify.c and fs/inotify_user.c for the locking and lifetime rules.
|
||||
|
||||
|
||||
(iv) Rationale
|
||||
(vi) Rationale
|
||||
|
||||
Q: What is the design decision behind not tying the watch to the open fd of
|
||||
the watched object?
|
||||
@@ -145,7 +263,7 @@ A: The poor user-space interface is the second biggest problem with dnotify.
|
||||
file descriptor-based one that allows basic file I/O and poll/select.
|
||||
Obtaining the fd and managing the watches could have been done either via a
|
||||
device file or a family of new system calls. We decided to implement a
|
||||
family of system calls because that is the preffered approach for new kernel
|
||||
family of system calls because that is the preferred approach for new kernel
|
||||
interfaces. The only real difference was whether we wanted to use open(2)
|
||||
and ioctl(2) or a couple of new system calls. System calls beat ioctls.
|
||||
|
||||
|
@@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of
|
||||
success and negative number in case of error (-EINVAL unless you have more
|
||||
informative error value to report). Call it foo_fill_super(). Now declare
|
||||
|
||||
struct super_block foo_get_sb(struct file_system_type *fs_type,
|
||||
int flags, const char *dev_name, void *data)
|
||||
int foo_get_sb(struct file_system_type *fs_type,
|
||||
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
|
||||
{
|
||||
return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
|
||||
return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
|
||||
mnt);
|
||||
}
|
||||
|
||||
(or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
|
||||
|
@@ -70,11 +70,13 @@ tmpfs mounts. See Documentation/filesystems/tmpfs.txt for more information.
|
||||
What is rootfs?
|
||||
---------------
|
||||
|
||||
Rootfs is a special instance of ramfs, which is always present in 2.6 systems.
|
||||
(It's used internally as the starting and stopping point for searches of the
|
||||
kernel's doubly-linked list of mount points.)
|
||||
Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
|
||||
always present in 2.6 systems. You can't unmount rootfs for approximately the
|
||||
same reason you can't kill the init process; rather than having special code
|
||||
to check for and handle an empty list, it's smaller and simpler for the kernel
|
||||
to just make sure certain lists can't become empty.
|
||||
|
||||
Most systems just mount another filesystem over it and ignore it. The
|
||||
Most systems just mount another filesystem over rootfs and ignore it. The
|
||||
amount of space an empty instance of ramfs takes up is tiny.
|
||||
|
||||
What is initramfs?
|
||||
@@ -92,14 +94,16 @@ out of that.
|
||||
|
||||
All this differs from the old initrd in several ways:
|
||||
|
||||
- The old initrd was a separate file, while the initramfs archive is linked
|
||||
into the linux kernel image. (The directory linux-*/usr is devoted to
|
||||
generating this archive during the build.)
|
||||
- The old initrd was always a separate file, while the initramfs archive is
|
||||
linked into the linux kernel image. (The directory linux-*/usr is devoted
|
||||
to generating this archive during the build.)
|
||||
|
||||
- The old initrd file was a gzipped filesystem image (in some file format,
|
||||
such as ext2, that had to be built into the kernel), while the new
|
||||
such as ext2, that needed a driver built into the kernel), while the new
|
||||
initramfs archive is a gzipped cpio archive (like tar only simpler,
|
||||
see cpio(1) and Documentation/early-userspace/buffer-format.txt).
|
||||
see cpio(1) and Documentation/early-userspace/buffer-format.txt). The
|
||||
kernel's cpio extraction code is not only extremely small, it's also
|
||||
__init data that can be discarded during the boot process.
|
||||
|
||||
- The program run by the old initrd (which was called /initrd, not /init) did
|
||||
some setup and then returned to the kernel, while the init program from
|
||||
@@ -124,13 +128,14 @@ Populating initramfs:
|
||||
|
||||
The 2.6 kernel build process always creates a gzipped cpio format initramfs
|
||||
archive and links it into the resulting kernel binary. By default, this
|
||||
archive is empty (consuming 134 bytes on x86). The config option
|
||||
CONFIG_INITRAMFS_SOURCE (for some reason buried under devices->block devices
|
||||
in menuconfig, and living in usr/Kconfig) can be used to specify a source for
|
||||
the initramfs archive, which will automatically be incorporated into the
|
||||
resulting binary. This option can point to an existing gzipped cpio archive, a
|
||||
directory containing files to be archived, or a text file specification such
|
||||
as the following example:
|
||||
archive is empty (consuming 134 bytes on x86).
|
||||
|
||||
The config option CONFIG_INITRAMFS_SOURCE (for some reason buried under
|
||||
devices->block devices in menuconfig, and living in usr/Kconfig) can be used
|
||||
to specify a source for the initramfs archive, which will automatically be
|
||||
incorporated into the resulting binary. This option can point to an existing
|
||||
gzipped cpio archive, a directory containing files to be archived, or a text
|
||||
file specification such as the following example:
|
||||
|
||||
dir /dev 755 0 0
|
||||
nod /dev/console 644 0 0 c 5 1
|
||||
@@ -146,23 +151,84 @@ as the following example:
|
||||
Run "usr/gen_init_cpio" (after the kernel build) to get a usage message
|
||||
documenting the above file format.
|
||||
|
||||
One advantage of the text file is that root access is not required to
|
||||
One advantage of the configuration file is that root access is not required to
|
||||
set permissions or create device nodes in the new archive. (Note that those
|
||||
two example "file" entries expect to find files named "init.sh" and "busybox" in
|
||||
a directory called "initramfs", under the linux-2.6.* directory. See
|
||||
Documentation/early-userspace/README for more details.)
|
||||
|
||||
The kernel does not depend on external cpio tools, gen_init_cpio is created
|
||||
from usr/gen_init_cpio.c which is entirely self-contained, and the kernel's
|
||||
boot-time extractor is also (obviously) self-contained. However, if you _do_
|
||||
happen to have cpio installed, the following command line can extract the
|
||||
generated cpio image back into its component files:
|
||||
The kernel does not depend on external cpio tools. If you specify a
|
||||
directory instead of a configuration file, the kernel's build infrastructure
|
||||
creates a configuration file from that directory (usr/Makefile calls
|
||||
scripts/gen_initramfs_list.sh), and proceeds to package up that directory
|
||||
using the config file (by feeding it to usr/gen_init_cpio, which is created
|
||||
from usr/gen_init_cpio.c). The kernel's build-time cpio creation code is
|
||||
entirely self-contained, and the kernel's boot-time extractor is also
|
||||
(obviously) self-contained.
|
||||
|
||||
The one thing you might need external cpio utilities installed for is creating
|
||||
or extracting your own preprepared cpio files to feed to the kernel build
|
||||
(instead of a config file or directory).
|
||||
|
||||
The following command line can extract a cpio image (either by the above script
|
||||
or by the kernel build) back into its component files:
|
||||
|
||||
cpio -i -d -H newc -F initramfs_data.cpio --no-absolute-filenames
|
||||
|
||||
The following shell script can create a prebuilt cpio archive you can
|
||||
use in place of the above config file:
|
||||
|
||||
#!/bin/sh
|
||||
|
||||
# Copyright 2006 Rob Landley <rob@landley.net> and TimeSys Corporation.
|
||||
# Licensed under GPL version 2
|
||||
|
||||
if [ $# -ne 2 ]
|
||||
then
|
||||
echo "usage: mkinitramfs directory imagename.cpio.gz"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -d "$1" ]
|
||||
then
|
||||
echo "creating $2 from $1"
|
||||
(cd "$1"; find . | cpio -o -H newc | gzip) > "$2"
|
||||
else
|
||||
echo "First argument must be a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
Note: The cpio man page contains some bad advice that will break your initramfs
|
||||
archive if you follow it. It says "A typical way to generate the list
|
||||
of filenames is with the find command; you should give find the -depth option
|
||||
to minimize problems with permissions on directories that are unwritable or not
|
||||
searchable." Don't do this when creating initramfs.cpio.gz images, it won't
|
||||
work. The Linux kernel cpio extractor won't create files in a directory that
|
||||
doesn't exist, so the directory entries must go before the files that go in
|
||||
those directories. The above script gets them in the right order.
|
||||
|
||||
External initramfs images:
|
||||
--------------------------
|
||||
|
||||
If the kernel has initrd support enabled, an external cpio.gz archive can also
|
||||
be passed into a 2.6 kernel in place of an initrd. In this case, the kernel
|
||||
will autodetect the type (initramfs, not initrd) and extract the external cpio
|
||||
archive into rootfs before trying to run /init.
|
||||
|
||||
This has the memory efficiency advantages of initramfs (no ramdisk block
|
||||
device) but the separate packaging of initrd (which is nice if you have
|
||||
non-GPL code you'd like to run from initramfs, without conflating it with
|
||||
the GPL licensed Linux kernel binary).
|
||||
|
||||
It can also be used to supplement the kernel's built-in initamfs image. The
|
||||
files in the external archive will overwrite any conflicting files in
|
||||
the built-in initramfs archive. Some distributors also prefer to customize
|
||||
a single kernel image with task-specific initramfs images, without recompiling.
|
||||
|
||||
Contents of initramfs:
|
||||
----------------------
|
||||
|
||||
An initramfs archive is a complete self-contained root filesystem for Linux.
|
||||
If you don't already understand what shared libraries, devices, and paths
|
||||
you need to get a minimal root filesystem up and running, here are some
|
||||
references:
|
||||
@@ -176,13 +242,36 @@ code against, along with some related utilities. It is BSD licensed.
|
||||
|
||||
I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net)
|
||||
myself. These are LGPL and GPL, respectively. (A self-contained initramfs
|
||||
package is planned for the busybox 1.2 release.)
|
||||
package is planned for the busybox 1.3 release.)
|
||||
|
||||
In theory you could use glibc, but that's not well suited for small embedded
|
||||
uses like this. (A "hello world" program statically linked against glibc is
|
||||
over 400k. With uClibc it's 7k. Also note that glibc dlopens libnss to do
|
||||
name lookups, even when otherwise statically linked.)
|
||||
|
||||
A good first step is to get initramfs to run a statically linked "hello world"
|
||||
program as init, and test it under an emulator like qemu (www.qemu.org) or
|
||||
User Mode Linux, like so:
|
||||
|
||||
cat > hello.c << EOF
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
printf("Hello world!\n");
|
||||
sleep(999999999);
|
||||
}
|
||||
EOF
|
||||
gcc -static hello2.c -o init
|
||||
echo init | cpio -o -H newc | gzip > test.cpio.gz
|
||||
# Testing external initramfs using the initrd loading mechanism.
|
||||
qemu -kernel /boot/vmlinuz -initrd test.cpio.gz /dev/zero
|
||||
|
||||
When debugging a normal root filesystem, it's nice to be able to boot with
|
||||
"init=/bin/sh". The initramfs equivalent is "rdinit=/bin/sh", and it's
|
||||
just as useful.
|
||||
|
||||
Why cpio rather than tar?
|
||||
-------------------------
|
||||
|
||||
@@ -241,7 +330,7 @@ the above threads) is:
|
||||
Future directions:
|
||||
------------------
|
||||
|
||||
Today (2.6.14), initramfs is always compiled in, but not always used. The
|
||||
Today (2.6.16), initramfs is always compiled in, but not always used. The
|
||||
kernel falls back to legacy boot code that is reached only if initramfs does
|
||||
not contain an /init program. The fallback is legacy code, there to ensure a
|
||||
smooth transition and allowing early boot functionality to gradually move to
|
||||
@@ -258,8 +347,9 @@ and so on.
|
||||
|
||||
This kind of complexity (which inevitably includes policy) is rightly handled
|
||||
in userspace. Both klibc and busybox/uClibc are working on simple initramfs
|
||||
packages to drop into a kernel build, and when standard solutions are ready
|
||||
and widely deployed, the kernel's legacy early boot code will become obsolete
|
||||
and a candidate for the feature removal schedule.
|
||||
packages to drop into a kernel build.
|
||||
|
||||
But that's a while off yet.
|
||||
The klibc package has now been accepted into Andrew Morton's 2.6.17-mm tree.
|
||||
The kernel's current early boot code (partition detection, etc) will probably
|
||||
be migrated into a default initramfs, automatically created and used by the
|
||||
kernel build.
|
||||
|
@@ -113,8 +113,8 @@ members are defined:
|
||||
struct file_system_type {
|
||||
const char *name;
|
||||
int fs_flags;
|
||||
struct super_block *(*get_sb) (struct file_system_type *, int,
|
||||
const char *, void *);
|
||||
struct int (*get_sb) (struct file_system_type *, int,
|
||||
const char *, void *, struct vfsmount *);
|
||||
void (*kill_sb) (struct super_block *);
|
||||
struct module *owner;
|
||||
struct file_system_type * next;
|
||||
@@ -211,7 +211,7 @@ struct super_operations {
|
||||
int (*sync_fs)(struct super_block *sb, int wait);
|
||||
void (*write_super_lockfs) (struct super_block *);
|
||||
void (*unlockfs) (struct super_block *);
|
||||
int (*statfs) (struct super_block *, struct kstatfs *);
|
||||
int (*statfs) (struct dentry *, struct kstatfs *);
|
||||
int (*remount_fs) (struct super_block *, int *, char *);
|
||||
void (*clear_inode) (struct inode *);
|
||||
void (*umount_begin) (struct super_block *);
|
||||
|
Reference in New Issue
Block a user