Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6

Conflicts: include/linux/kernel.h
2006-07-03 10:25:08 -04:00
parent af18ddb886 29454dde27
commit 0a1340c185
7737 changed files with 255567 additions and 164571 deletions
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -99,7 +99,7 @@ prototypes:
 	int (*sync_fs)(struct super_block *sb, int wait);
 	void (*write_super_lockfs) (struct super_block *);
 	void (*unlockfs) (struct super_block *);
-	int (*statfs) (struct super_block *, struct kstatfs *);
+	int (*statfs) (struct dentry *, struct kstatfs *);
 	int (*remount_fs) (struct super_block *, int *, char *);
 	void (*clear_inode) (struct inode *);
 	void (*umount_begin) (struct super_block *);
@@ -142,15 +142,16 @@ see also dquot_operations section.

 --------------------------- file_system_type ---------------------------
 prototypes:
-	struct super_block *(*get_sb) (struct file_system_type *, int,
-			const char *, void *);
+	struct int (*get_sb) (struct file_system_type *, int,
+			const char *, void *, struct vfsmount *);
 	void (*kill_sb) (struct super_block *);
 locking rules:
 		may block	BKL
 get_sb		yes		yes
 kill_sb		yes		yes

->get_sb() returns error or a locked superblock (exclusive on ->s_umount).
+->get_sb() returns error or 0 with locked superblock attached to the vfsmount
+(exclusive on ->s_umount).
 ->kill_sb() takes a write-locked superblock, does all shutdown work on it,
 unlocks and drops the reference.

--- a/Documentation/filesystems/automount-support.txt
+++ b/Documentation/filesystems/automount-support.txt
@@ -19,7 +19,7 @@ following procedure:

 (2) Have the follow_link() op do the following steps:

-     (a) Call do_kern_mount() to call the appropriate filesystem to set up a
+     (a) Call vfs_kern_mount() to call the appropriate filesystem to set up a
         superblock and gain a vfsmount structure representing it.

     (b) Copy the nameidata provided as an argument and substitute the dentry
--- a/Documentation/filesystems/configfs/configfs_example.c
+++ b/Documentation/filesystems/configfs/configfs_example.c
@@ -264,6 +264,15 @@ static struct config_item_type simple_child_type = {
 };


+struct simple_children {
+	struct config_group group;
+};
+
+static inline struct simple_children *to_simple_children(struct config_item *item)
+{
+	return item ? container_of(to_config_group(item), struct simple_children, group) : NULL;
+}
+
 static struct config_item *simple_children_make_item(struct config_group *group, const char *name)
 {
 	struct simple_child *simple_child;
@@ -304,7 +313,13 @@ static ssize_t simple_children_attr_show(struct config_item *item,
 "items have only one attribute that is readable and writeable.\n");
 }

+static void simple_children_release(struct config_item *item)
+{
+	kfree(to_simple_children(item));
+}
+
 static struct configfs_item_operations simple_children_item_ops = {
+	.release 	= simple_children_release,
 	.show_attribute	= simple_children_attr_show,
 };

@@ -345,10 +360,6 @@ static struct configfs_subsystem simple_children_subsys = {
 * children of its own.
 */

-struct simple_children {
-	struct config_group group;
-};
-
 static struct config_group *group_children_make_group(struct config_group *group, const char *name)
 {
 	struct simple_children *simple_children;
--- a/Documentation/filesystems/devfs/ChangeLog
+++ b/Documentation/filesystems/devfs/ChangeLog
--- a/Documentation/filesystems/devfs/README
+++ b/Documentation/filesystems/devfs/README
--- a/Documentation/filesystems/devfs/ToDo
+++ b/Documentation/filesystems/devfs/ToDo
@@ -1,40 +0,0 @@
-		Device File System (devfs) ToDo List
-
-		Richard Gooch <rgooch@atnf.csiro.au>
-
-			      3-JUL-2000
-
-This is a list of things to be done for better devfs support in the
-Linux kernel. If you'd like to contribute to the devfs, please have a
-look at this list for anything that is unallocated. Also, if there are
-items missing (surely), please contact me so I can add them to the
-list (preferably with your name attached to them:-).
-
-
- >256 ptys
-  Thanks to C. Scott Ananian <cananian@alumni.princeton.edu>
-
- Amiga floppy driver (drivers/block/amiflop.c)
-
- Atari floppy driver (drivers/block/ataflop.c)
-
- SWIM3 (Super Woz Integrated Machine 3) floppy driver (drivers/block/swim3.c)
-
- Amiga ZorroII ramdisc driver (drivers/block/z2ram.c)
-
- Parallel port ATAPI CD-ROM (drivers/block/paride/pcd.c)
-
- Parallel port ATAPI floppy (drivers/block/paride/pf.c)
-
- AP1000 block driver (drivers/ap1000/ap.c, drivers/ap1000/ddv.c)
-
- Archimedes floppy (drivers/acorn/block/fd1772.c)
-
- MFM hard drive (drivers/acorn/block/mfmhd.c)
-
- I2O block device (drivers/message/i2o/i2o_block.c)
-
- ST-RAM device (arch/m68k/atari/stram.c)
-
- Raw devices
-
--- a/Documentation/filesystems/devfs/boot-options
+++ b/Documentation/filesystems/devfs/boot-options
@@ -1,65 +0,0 @@
-/* -*- auto-fill -*-                                                         */
-
-		Device File System (devfs) Boot Options
-
-		Richard Gooch <rgooch@atnf.csiro.au>
-
-			      18-AUG-2001
-
-
-When CONFIG_DEVFS_DEBUG is enabled, you can pass several boot options
-to the kernel to debug devfs. The boot options are prefixed by
-"devfs=", and are separated by commas. Spaces are not allowed. The
-syntax looks like this:
-
-devfs=<option1>,<option2>,<option3>
-
-and so on. For example, if you wanted to turn on debugging for module
-load requests and device registration, you would do:
-
-devfs=dmod,dreg
-
-You may prefix "no" to any option. This will invert the option.
-
-
-Debugging Options
-=================
-
-These requires CONFIG_DEVFS_DEBUG to be enabled.
-Note that all debugging options have 'd' as the first character. By
-default all options are off. All debugging output is sent to the
-kernel logs. The debugging options do not take effect until the devfs
-version message appears (just prior to the root filesystem being
-mounted).
-
-These are the options:
-
-dmod		print module load requests to <request_module>
-
-dreg		print device register requests to <devfs_register>
-
-dunreg		print device unregister requests to <devfs_unregister>
-
-dchange		print device change requests to <devfs_set_flags>
-
-dilookup	print inode lookup requests
-
-diget		print VFS inode allocations
-
-diunlink	print inode unlinks
-
-dichange	print inode changes
-
-dimknod		print calls to mknod(2)
-
-dall		some debugging turned on
-
-
-Other Options
-=============
-
-These control the default behaviour of devfs. The options are:
-
-mount		mount devfs onto /dev at boot time
-
-only		disable non-devfs device nodes for devfs-capable drivers
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -113,6 +113,14 @@ noquota
 grpquota
 usrquota

+bh		(*)	ext3 associates buffer heads to data pages to
+nobh			(a) cache disk block mapping information
+			(b) link pages into transaction to provide
+			    ordering guarantees.
+			"bh" option forces use of buffer heads.
+			"nobh" option tries to avoid associating buffer
+			heads (supported only for "writeback" mode).
+

 Specification
 =============
--- a/Documentation/filesystems/fuse.txt
+++ b/Documentation/filesystems/fuse.txt
@@ -18,6 +18,14 @@ Non-privileged mount (or user mount):
  user.  NOTE: this is not the same as mounts allowed with the "user"
  option in /etc/fstab, which is not discussed here.

+Filesystem connection:
+
+  A connection between the filesystem daemon and the kernel.  The
+  connection exists until either the daemon dies, or the filesystem is
+  umounted.  Note that detaching (or lazy umounting) the filesystem
+  does _not_ break the connection, in this case it will exist until
+  the last reference to the filesystem is released.
+
 Mount owner:

  The user who does the mounting.
@@ -86,16 +94,20 @@ Mount options
  The default is infinite.  Note that the size of read requests is
  limited anyway to 32 pages (which is 128kbyte on i386).

-Sysfs
-~~~~~
+Control filesystem
+~~~~~~~~~~~~~~~~~~

-FUSE sets up the following hierarchy in sysfs:
+There's a control filesystem for FUSE, which can be mounted by:

-  /sys/fs/fuse/connections/N/
+  mount -t fusectl none /sys/fs/fuse/connections

-where N is an increasing number allocated to each new connection.
+Mounting it under the '/sys/fs/fuse/connections' directory makes it
+backwards compatible with earlier versions.

-For each connection the following attributes are defined:
+Under the fuse control filesystem each connection has a directory
+named by a unique number.
+
+For each connection the following files exist within this directory:

 'waiting'

@@ -110,7 +122,47 @@ For each connection the following attributes are defined:
  connection.  This means that all waiting requests will be aborted an
  error returned for all aborted and new requests.

-Only a privileged user may read or write these attributes.
+Only the owner of the mount may read or write these files.
+
+Interrupting filesystem operations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If a process issuing a FUSE filesystem request is interrupted, the
+following will happen:
+
+  1) If the request is not yet sent to userspace AND the signal is
+     fatal (SIGKILL or unhandled fatal signal), then the request is
+     dequeued and returns immediately.
+
+  2) If the request is not yet sent to userspace AND the signal is not
+     fatal, then an 'interrupted' flag is set for the request.  When
+     the request has been successfully transfered to userspace and
+     this flag is set, an INTERRUPT request is queued.
+
+  3) If the request is already sent to userspace, then an INTERRUPT
+     request is queued.
+
+INTERRUPT requests take precedence over other requests, so the
+userspace filesystem will receive queued INTERRUPTs before any others.
+
+The userspace filesystem may ignore the INTERRUPT requests entirely,
+or may honor them by sending a reply to the _original_ request, with
+the error set to EINTR.
+
+It is also possible that there's a race between processing the
+original request and it's INTERRUPT request.  There are two possibilities:
+
+  1) The INTERRUPT request is processed before the original request is
+     processed
+
+  2) The INTERRUPT request is processed after the original request has
+     been answered
+
+If the filesystem cannot find the original request, it should wait for
+some timeout and/or a number of new requests to arrive, after which it
+should reply to the INTERRUPT request with an EAGAIN error.  In case
+1) the INTERRUPT request will be requeued.  In case 2) the INTERRUPT
+reply will be ignored.

 Aborting a filesystem connection
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -139,8 +191,8 @@ the filesystem.  There are several ways to do this:
  - Use forced umount (umount -f).  Works in all cases but only if
    filesystem is still attached (it hasn't been lazy unmounted)

-  - Abort filesystem through the sysfs interface.  Most powerful
-    method, always works.
+  - Abort filesystem through the FUSE control filesystem.  Most
+    powerful method, always works.

 How do non-privileged mounts work?
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -304,25 +356,7 @@ Scenario 1 -  Simple deadlock
 |                                    |     for "file"]
 |                                    |    *DEADLOCK*

-The solution for this is to allow requests to be interrupted while
-they are in userspace:
-
- |      [interrupted by signal]       |
- |    <fuse_unlink()                  |
- |    [release semaphore]             |    [semaphore acquired]
- |  <sys_unlink()                     |
- |                                    |    >fuse_unlink()
- |                                    |      [queue req on fc->pending]
- |                                    |      [wake up fc->waitq]
- |                                    |      [sleep on req->waitq]
-
-If the filesystem daemon was single threaded, this will stop here,
-since there's no other thread to dequeue and execute the request.
-In this case the solution is to kill the FUSE daemon as well.  If
-there are multiple serving threads, you just have to kill them as
-long as any remain.
-
-Moral: a filesystem which deadlocks, can soon find itself dead.
+The solution for this is to allow the filesystem to be aborted.

 Scenario 2 - Tricky deadlock
 ----------------------------
@@ -355,24 +389,14 @@ but is caused by a pagefault.
 |                                    |           [lock page]
 |                                    |           * DEADLOCK *

-Solution is again to let the the request be interrupted (not
-elaborated further).
+Solution is basically the same as above.

-An additional problem is that while the write buffer is being
-copied to the request, the request must not be interrupted.  This
-is because the destination address of the copy may not be valid
-after the request is interrupted.
+An additional problem is that while the write buffer is being copied
+to the request, the request must not be interrupted/aborted.  This is
+because the destination address of the copy may not be valid after the
+request has returned.

-This is solved with doing the copy atomically, and allowing
-interruption while the page(s) belonging to the write buffer are
-faulted with get_user_pages().  The 'req->locked' flag indicates
-when the copy is taking place, and interruption is delayed until
-this flag is unset.
-
-Scenario 3 - Tricky deadlock with asynchronous read
---------------------------------------------------
-
-The same situation as above, except thread-1 will wait on page lock
-and hence it will be uninterruptible as well.  The solution is to
-abort the connection with forced umount (if mount is attached) or
-through the abort attribute in sysfs.
+This is solved with doing the copy atomically, and allowing abort
+while the page(s) belonging to the write buffer are faulted with
+get_user_pages().  The 'req->locked' flag indicates when the copy is
+taking place, and abort is delayed until this flag is unset.
--- a/Documentation/filesystems/inotify.txt
+++ b/Documentation/filesystems/inotify.txt
@@ -69,17 +69,135 @@ Prototypes:
 	int inotify_rm_watch (int fd, __u32 mask);


-(iii) Internal Kernel Implementation
+(iii) Kernel Interface

-Each inotify instance is associated with an inotify_device structure.
+Inotify's kernel API consists a set of functions for managing watches and an
+event callback.
+
+To use the kernel API, you must first initialize an inotify instance with a set
+of inotify_operations.  You are given an opaque inotify_handle, which you use
+for any further calls to inotify.
+
+    struct inotify_handle *ih = inotify_init(my_event_handler);
+
+You must provide a function for processing events and a function for destroying
+the inotify watch.
+
+    void handle_event(struct inotify_watch *watch, u32 wd, u32 mask,
+    	              u32 cookie, const char *name, struct inode *inode)
+
+	watch - the pointer to the inotify_watch that triggered this call
+	wd - the watch descriptor
+	mask - describes the event that occurred
+	cookie - an identifier for synchronizing events
+	name - the dentry name for affected files in a directory-based event
+	inode - the affected inode in a directory-based event
+
+    void destroy_watch(struct inotify_watch *watch)
+
+You may add watches by providing a pre-allocated and initialized inotify_watch
+structure and specifying the inode to watch along with an inotify event mask.
+You must pin the inode during the call.  You will likely wish to embed the
+inotify_watch structure in a structure of your own which contains other
+information about the watch.  Once you add an inotify watch, it is immediately
+subject to removal depending on filesystem events.  You must grab a reference if
+you depend on the watch hanging around after the call.
+
+    inotify_init_watch(&my_watch->iwatch);
+    inotify_get_watch(&my_watch->iwatch);	// optional
+    s32 wd = inotify_add_watch(ih, &my_watch->iwatch, inode, mask);
+    inotify_put_watch(&my_watch->iwatch);	// optional
+
+You may use the watch descriptor (wd) or the address of the inotify_watch for
+other inotify operations.  You must not directly read or manipulate data in the
+inotify_watch.  Additionally, you must not call inotify_add_watch() more than
+once for a given inotify_watch structure, unless you have first called either
+inotify_rm_watch() or inotify_rm_wd().
+
+To determine if you have already registered a watch for a given inode, you may
+call inotify_find_watch(), which gives you both the wd and the watch pointer for
+the inotify_watch, or an error if the watch does not exist.
+
+    wd = inotify_find_watch(ih, inode, &watchp);
+
+You may use container_of() on the watch pointer to access your own data
+associated with a given watch.  When an existing watch is found,
+inotify_find_watch() bumps the refcount before releasing its locks.  You must
+put that reference with:
+
+    put_inotify_watch(watchp);
+
+Call inotify_find_update_watch() to update the event mask for an existing watch.
+inotify_find_update_watch() returns the wd of the updated watch, or an error if
+the watch does not exist.
+
+    wd = inotify_find_update_watch(ih, inode, mask);
+
+An existing watch may be removed by calling either inotify_rm_watch() or
+inotify_rm_wd().
+
+    int ret = inotify_rm_watch(ih, &my_watch->iwatch);
+    int ret = inotify_rm_wd(ih, wd);
+
+A watch may be removed while executing your event handler with the following:
+
+    inotify_remove_watch_locked(ih, iwatch);
+
+Call inotify_destroy() to remove all watches from your inotify instance and
+release it.  If there are no outstanding references, inotify_destroy() will call
+your destroy_watch op for each watch.
+
+    inotify_destroy(ih);
+
+When inotify removes a watch, it sends an IN_IGNORED event to your callback.
+You may use this event as an indication to free the watch memory.  Note that
+inotify may remove a watch due to filesystem events, as well as by your request.
+If you use IN_ONESHOT, inotify will remove the watch after the first event, at
+which point you may call the final inotify_put_watch.
+
+(iv) Kernel Interface Prototypes
+
+	struct inotify_handle *inotify_init(struct inotify_operations *ops);
+
+	inotify_init_watch(struct inotify_watch *watch);
+
+	s32 inotify_add_watch(struct inotify_handle *ih,
+		              struct inotify_watch *watch,
+			      struct inode *inode, u32 mask);
+
+	s32 inotify_find_watch(struct inotify_handle *ih, struct inode *inode,
+			       struct inotify_watch **watchp);
+
+	s32 inotify_find_update_watch(struct inotify_handle *ih,
+				      struct inode *inode, u32 mask);
+
+	int inotify_rm_wd(struct inotify_handle *ih, u32 wd);
+
+	int inotify_rm_watch(struct inotify_handle *ih,
+			     struct inotify_watch *watch);
+
+	void inotify_remove_watch_locked(struct inotify_handle *ih,
+					 struct inotify_watch *watch);
+
+	void inotify_destroy(struct inotify_handle *ih);
+
+	void get_inotify_watch(struct inotify_watch *watch);
+	void put_inotify_watch(struct inotify_watch *watch);
+
+
+(v) Internal Kernel Implementation
+
+Each inotify instance is represented by an inotify_handle structure.
+Inotify's userspace consumers also have an inotify_device which is
+associated with the inotify_handle, and on which events are queued.

 Each watch is associated with an inotify_watch structure.  Watches are chained
-off of each associated device and each associated inode.
+off of each associated inotify_handle and each associated inode.

-See fs/inotify.c for the locking and lifetime rules.
+See fs/inotify.c and fs/inotify_user.c for the locking and lifetime rules.


-(iv) Rationale
+(vi) Rationale

 Q: What is the design decision behind not tying the watch to the open fd of
   the watched object?
@@ -145,7 +263,7 @@ A: The poor user-space interface is the second biggest problem with dnotify.
   file descriptor-based one that allows basic file I/O and poll/select.
   Obtaining the fd and managing the watches could have been done either via a
   device file or a family of new system calls.  We decided to implement a
-   family of system calls because that is the preffered approach for new kernel
+   family of system calls because that is the preferred approach for new kernel
   interfaces.  The only real difference was whether we wanted to use open(2)
   and ioctl(2) or a couple of new system calls.  System calls beat ioctls.

--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -50,10 +50,11 @@ Turn your foo_read_super() into a function that would return 0 in case of
 success and negative number in case of error (-EINVAL unless you have more
 informative error value to report).  Call it foo_fill_super().  Now declare

-struct super_block foo_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+int foo_get_sb(struct file_system_type *fs_type,
+	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext2_fill_super);
+	return get_sb_bdev(fs_type, flags, dev_name, data, foo_fill_super,
+			   mnt);
 }

 (or similar with s/bdev/nodev/ or s/bdev/single/, depending on the kind of
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -70,11 +70,13 @@ tmpfs mounts.  See Documentation/filesystems/tmpfs.txt for more information.
 What is rootfs?
 ---------------

-Rootfs is a special instance of ramfs, which is always present in 2.6 systems.
-(It's used internally as the starting and stopping point for searches of the
-kernel's doubly-linked list of mount points.)
+Rootfs is a special instance of ramfs (or tmpfs, if that's enabled), which is
+always present in 2.6 systems.  You can't unmount rootfs for approximately the
+same reason you can't kill the init process; rather than having special code
+to check for and handle an empty list, it's smaller and simpler for the kernel
+to just make sure certain lists can't become empty.

-Most systems just mount another filesystem over it and ignore it.  The
+Most systems just mount another filesystem over rootfs and ignore it.  The
 amount of space an empty instance of ramfs takes up is tiny.

 What is initramfs?
@@ -92,14 +94,16 @@ out of that.

 All this differs from the old initrd in several ways:

-  - The old initrd was a separate file, while the initramfs archive is linked
-    into the linux kernel image.  (The directory linux-*/usr is devoted to
-    generating this archive during the build.)
+  - The old initrd was always a separate file, while the initramfs archive is
+    linked into the linux kernel image.  (The directory linux-*/usr is devoted
+    to generating this archive during the build.)

  - The old initrd file was a gzipped filesystem image (in some file format,
-    such as ext2, that had to be built into the kernel), while the new
+    such as ext2, that needed a driver built into the kernel), while the new
    initramfs archive is a gzipped cpio archive (like tar only simpler,
-    see cpio(1) and Documentation/early-userspace/buffer-format.txt).
+    see cpio(1) and Documentation/early-userspace/buffer-format.txt).  The
+    kernel's cpio extraction code is not only extremely small, it's also
+    __init data that can be discarded during the boot process.

  - The program run by the old initrd (which was called /initrd, not /init) did
    some setup and then returned to the kernel, while the init program from
@@ -124,13 +128,14 @@ Populating initramfs:

 The 2.6 kernel build process always creates a gzipped cpio format initramfs
 archive and links it into the resulting kernel binary.  By default, this
-archive is empty (consuming 134 bytes on x86).  The config option
-CONFIG_INITRAMFS_SOURCE (for some reason buried under devices->block devices
-in menuconfig, and living in usr/Kconfig) can be used to specify a source for
-the initramfs archive, which will automatically be incorporated into the
-resulting binary.  This option can point to an existing gzipped cpio archive, a
-directory containing files to be archived, or a text file specification such
-as the following example:
+archive is empty (consuming 134 bytes on x86).
+
+The config option CONFIG_INITRAMFS_SOURCE (for some reason buried under
+devices->block devices in menuconfig, and living in usr/Kconfig) can be used
+to specify a source for the initramfs archive, which will automatically be
+incorporated into the resulting binary.  This option can point to an existing
+gzipped cpio archive, a directory containing files to be archived, or a text
+file specification such as the following example:

  dir /dev 755 0 0
  nod /dev/console 644 0 0 c 5 1
@@ -146,23 +151,84 @@ as the following example:
 Run "usr/gen_init_cpio" (after the kernel build) to get a usage message
 documenting the above file format.

-One advantage of the text file is that root access is not required to
+One advantage of the configuration file is that root access is not required to
 set permissions or create device nodes in the new archive.  (Note that those
 two example "file" entries expect to find files named "init.sh" and "busybox" in
 a directory called "initramfs", under the linux-2.6.* directory.  See
 Documentation/early-userspace/README for more details.)

-The kernel does not depend on external cpio tools, gen_init_cpio is created
-from usr/gen_init_cpio.c which is entirely self-contained, and the kernel's
-boot-time extractor is also (obviously) self-contained.  However, if you _do_
-happen to have cpio installed, the following command line can extract the
-generated cpio image back into its component files:
+The kernel does not depend on external cpio tools.  If you specify a
+directory instead of a configuration file, the kernel's build infrastructure
+creates a configuration file from that directory (usr/Makefile calls
+scripts/gen_initramfs_list.sh), and proceeds to package up that directory
+using the config file (by feeding it to usr/gen_init_cpio, which is created
+from usr/gen_init_cpio.c).  The kernel's build-time cpio creation code is
+entirely self-contained, and the kernel's boot-time extractor is also
+(obviously) self-contained.
+
+The one thing you might need external cpio utilities installed for is creating
+or extracting your own preprepared cpio files to feed to the kernel build
+(instead of a config file or directory).
+
+The following command line can extract a cpio image (either by the above script
+or by the kernel build) back into its component files:

  cpio -i -d -H newc -F initramfs_data.cpio --no-absolute-filenames

+The following shell script can create a prebuilt cpio archive you can
+use in place of the above config file:
+
+  #!/bin/sh
+
+  # Copyright 2006 Rob Landley <rob@landley.net> and TimeSys Corporation.
+  # Licensed under GPL version 2
+
+  if [ $# -ne 2 ]
+  then
+    echo "usage: mkinitramfs directory imagename.cpio.gz"
+    exit 1
+  fi
+
+  if [ -d "$1" ]
+  then
+    echo "creating $2 from $1"
+    (cd "$1"; find . | cpio -o -H newc | gzip) > "$2"
+  else
+    echo "First argument must be a directory"
+    exit 1
+  fi
+
+Note: The cpio man page contains some bad advice that will break your initramfs
+archive if you follow it.  It says "A typical way to generate the list
+of filenames is with the find command; you should give find the -depth option
+to minimize problems with permissions on directories that are unwritable or not
+searchable."  Don't do this when creating initramfs.cpio.gz images, it won't
+work.  The Linux kernel cpio extractor won't create files in a directory that
+doesn't exist, so the directory entries must go before the files that go in
+those directories.  The above script gets them in the right order.
+
+External initramfs images:
+--------------------------
+
+If the kernel has initrd support enabled, an external cpio.gz archive can also
+be passed into a 2.6 kernel in place of an initrd.  In this case, the kernel
+will autodetect the type (initramfs, not initrd) and extract the external cpio
+archive into rootfs before trying to run /init.
+
+This has the memory efficiency advantages of initramfs (no ramdisk block
+device) but the separate packaging of initrd (which is nice if you have
+non-GPL code you'd like to run from initramfs, without conflating it with
+the GPL licensed Linux kernel binary).
+
+It can also be used to supplement the kernel's built-in initamfs image.  The
+files in the external archive will overwrite any conflicting files in
+the built-in initramfs archive.  Some distributors also prefer to customize
+a single kernel image with task-specific initramfs images, without recompiling.
+
 Contents of initramfs:
 ----------------------

+An initramfs archive is a complete self-contained root filesystem for Linux.
 If you don't already understand what shared libraries, devices, and paths
 you need to get a minimal root filesystem up and running, here are some
 references:
@@ -176,13 +242,36 @@ code against, along with some related utilities.  It is BSD licensed.

 I use uClibc (http://www.uclibc.org) and busybox (http://www.busybox.net)
 myself.  These are LGPL and GPL, respectively.  (A self-contained initramfs
-package is planned for the busybox 1.2 release.)
+package is planned for the busybox 1.3 release.)

 In theory you could use glibc, but that's not well suited for small embedded
 uses like this.  (A "hello world" program statically linked against glibc is
 over 400k.  With uClibc it's 7k.  Also note that glibc dlopens libnss to do
 name lookups, even when otherwise statically linked.)

+A good first step is to get initramfs to run a statically linked "hello world"
+program as init, and test it under an emulator like qemu (www.qemu.org) or
+User Mode Linux, like so:
+
+  cat > hello.c << EOF
+  #include <stdio.h>
+  #include <unistd.h>
+
+  int main(int argc, char *argv[])
+  {
+    printf("Hello world!\n");
+    sleep(999999999);
+  }
+  EOF
+  gcc -static hello2.c -o init
+  echo init | cpio -o -H newc | gzip > test.cpio.gz
+  # Testing external initramfs using the initrd loading mechanism.
+  qemu -kernel /boot/vmlinuz -initrd test.cpio.gz /dev/zero
+
+When debugging a normal root filesystem, it's nice to be able to boot with
+"init=/bin/sh".  The initramfs equivalent is "rdinit=/bin/sh", and it's
+just as useful.
+
 Why cpio rather than tar?
 -------------------------

@@ -241,7 +330,7 @@ the above threads) is:
 Future directions:
 ------------------

-Today (2.6.14), initramfs is always compiled in, but not always used.  The
+Today (2.6.16), initramfs is always compiled in, but not always used.  The
 kernel falls back to legacy boot code that is reached only if initramfs does
 not contain an /init program.  The fallback is legacy code, there to ensure a
 smooth transition and allowing early boot functionality to gradually move to
@@ -258,8 +347,9 @@ and so on.

 This kind of complexity (which inevitably includes policy) is rightly handled
 in userspace.  Both klibc and busybox/uClibc are working on simple initramfs
-packages to drop into a kernel build, and when standard solutions are ready
-and widely deployed, the kernel's legacy early boot code will become obsolete
-and a candidate for the feature removal schedule.
+packages to drop into a kernel build.

-But that's a while off yet.
+The klibc package has now been accepted into Andrew Morton's 2.6.17-mm tree.
+The kernel's current early boot code (partition detection, etc) will probably
+be migrated into a default initramfs, automatically created and used by the
+kernel build.
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -113,8 +113,8 @@ members are defined:
 struct file_system_type {
 	const char *name;
 	int fs_flags;
-        struct super_block *(*get_sb) (struct file_system_type *, int,
-                                       const char *, void *);
+        struct int (*get_sb) (struct file_system_type *, int,
+                              const char *, void *, struct vfsmount *);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
@@ -211,7 +211,7 @@ struct super_operations {
        int (*sync_fs)(struct super_block *sb, int wait);
        void (*write_super_lockfs) (struct super_block *);
        void (*unlockfs) (struct super_block *);
-        int (*statfs) (struct super_block *, struct kstatfs *);
+        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
        void (*clear_inode) (struct inode *);
        void (*umount_begin) (struct super_block *);