diff -rupN linux-kaio/Documentation/Configure.help linux/Documentation/Configure.help --- linux-kaio/Documentation/Configure.help Sun Aug 26 00:09:13 2001 +++ linux/Documentation/Configure.help Sun Aug 26 00:01:15 2001 @@ -2481,7 +2481,6 @@ CONFIG_HUB6 Say Y here to enable support in the dumb serial driver to support the HUB6 card. - PCMCIA serial device support CONFIG_PCMCIA_SERIAL_CS Say Y here to enable support for 16-bit PCMCIA serial devices, @@ -2495,7 +2494,6 @@ CONFIG_PCMCIA_SERIAL_CS a module, say M here and read Documentation/modules.txt. If unsure, say N. - Asynchronous I/O support CONFIG_AIO Enables support for Posix Asynchronous I/O Facilities. AIO is supported @@ -2503,7 +2501,6 @@ CONFIG_AIO (see "Support raw access to SCSI disk devices" under "SCSI support"). - /dev/agpgart (AGP Support) (EXPERIMENTAL) CONFIG_AGP AGP (Accelerated Graphics Port) is a bus system mainly used to @@ -11734,6 +11731,12 @@ CONFIG_REISERFS_CHECK effect on end users. If you are on the verge of sending in a bug report, say yes and you might get a useful error message. Almost everyone should say no. + +CONFIG_REISERFS_RAW + Setting this to yes will enable a set of ioctls that provide a raw + interface to the reiserfs tree, bypassing directories, and automatically + removing aged files. This is an experimental feature designed for + squid cache directories. See Documentation/reiserfs-raw.txt Second extended fs support CONFIG_EXT2_FS diff -rupN linux-kaio/Documentation/reiserfs-raw.txt linux/Documentation/reiserfs-raw.txt --- linux-kaio/Documentation/reiserfs-raw.txt Thu Jan 1 03:00:00 1970 +++ linux/Documentation/reiserfs-raw.txt Sat Aug 25 17:14:13 2001 @@ -0,0 +1,205 @@ + +Here is a quick introduction to REISERFS-RAW. + +$Revision: 1.1.2.3 $ +$Date: 2000/11/08 05:46:45 $ + + +1. What is it? +~~~~~~~~~~~~~~ + +Reiserfs-raw is a "raw" interface to reiserfs.
Its purpose is to gain +speed and ease of implementation for some applications, notably the +Squid Internet Object Cache (http://www.squid-cache.org). (The squid +version that uses reiserfs-raw can be found at squidng.sourceforge.net, +see also http://www.swelltech.com/pengies/joe/squidng.html and +ftp://ftp.botik.ru/pub/local/sizif/squid). + +In the reiserfs-raw view, files on the disk are named by 64-bit numbers. +There are no directories, so you cannot structure the namespace. (For +now, you cannot even read the list of files on the disk.) On the +other hand, there is no directory overhead and there's less locking in +the kernel on file creation/deletion. + +Another feature of reiserfs-raw is its support for file aging and +removal. Reiserfs-raw keeps track of file age distribution, and keeps +disk usage within specified bounds by removing the files older than +a certain age when disk space becomes tight. This critical age is +termed "age threshold". + +We use a file removal technique that we call Passive Garbage +Collection (PGC). It is "passive" because it never scans the disk +looking for aged files; instead, it monitors filesystem meta-data that +filesystem code examines as part of its normal operation, and removes +the files that are both aged and convenient to remove at the moment. +The file age distribution is maintained in the form of a "class LRU" table. +You can see its current state in /proc/fs/reiserfs/DEVICE/age_class. +See include/linux/age_class.h for further info. + +** WARNING: Reiserfs-raw is an experimental feature, not conforming to +** any standards. The description below simply describes the current +** implementation, not a proposed standard. Both implementation and +** interface may well be changed in the future. So if you use it, be +** prepared to rewrite your application code on upgrade. + +** SECURITY WARNING: Reiserfs_raw has a working but unusual security +** model.
Please see the section "Access permissions" below to make +** sure you understand the security model right and don't compromise +** security of your system by using reiserfs_raw improperly. + + + +2. Syscalls and data structures +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +/* For those who prefer studying examples, not specifications: you can + use the test program fs/reiserfs/utils/test-raw/raw.c as example + application code. */ + +The interface is implemented as a set of ioctls. The file descriptor +that the ioctls are applied to should be obtained from open(2) of the +root directory of the reiserfs-raw filesystem, like this: + + int dir = open ("/cache1", O_RDONLY); + +The following structure is used as the argument for all reiserfs-raw +ioctls: + +struct reiserfs_raw_ioctl_arg { + __u32 id1; /* object id of the obj to operate on (low part) */ + __u32 id2; /* object id of the obj to operate on (high part) */ + __u32 lastref; /* last reference date (a la atime) */ + __u32 expires; /* expiration date */ + __u32 user1; /* user attribute, not used by the kernel */ + loff_t size; /* file size returned by RAWOPEN */ +}; + +The 'id1' and 'id2' together form the 64-bit numeric id (the filename +equivalent) to operate on. + +The 'lastref' field is used by PGC. It is similar in spirit to atime, +but is only affected by CREAT and OPEN, not by read and write. Also, +it is set not to current time but to some coarse-grained derivation of +it; this reduces the need for lastref updates when you frequently +access one file. + +The 'expires' field is set by the user to order file removal by PGC +after the current time gets beyond the expiration date. If expiration +is not desired, be careful to set it to end-of-epoch (~0) in CREAT and +SETATTR operations. + +The 'user1' field is an opaque (for the kernel) attribute set in CREAT +and SETATTR and retrieved by OPEN. The application may use it at will. + +The 'size' field is the file size. It is only valid upon return from +the OPEN operation.
We could get the size using fstat(2) on the file +descriptor returned by OPEN, so it's there only to save us the +overhead of an extra fstat(2) syscall. + + +Now to the syscalls. + +fd = ioctl (dir, REISERFS_IOC_RAWCREAT, &raw_ioctl_arg); + +Creates a raw file with the id and attributes specified in +&raw_ioctl_arg, and returns a descriptor of the file open for +read/write. Returns EEXIST if the file id1:id2 already exists. + +fd = ioctl (dir, REISERFS_IOC_RAWOPEN, &raw_ioctl_arg); + +Opens the raw file with the id specified in &raw_ioctl_arg, and returns +a descriptor of the file open for read/write. The attribute fields in +&raw_ioctl_arg are filled with the actual file's attributes. + +error = ioctl (dir, REISERFS_IOC_RAWUNLINK, &raw_ioctl_arg); + +Removes the file with the id specified in &raw_ioctl_arg. + +fd = ioctl (dir, REISERFS_IOC_RAWSETATTR, &raw_ioctl_arg); +fd = ioctl (fd, REISERFS_IOC_RAWSETATTR, &raw_ioctl_arg); + +Sets attributes of the raw file with the id specified in +raw_ioctl_arg, and fills raw_ioctl_arg with the file's old attributes. + +This ioctl can also be applied to the file descriptor of an open raw +file, in which case that file's attributes are set and the id in +raw_ioctl_arg is not used. + +Those are all the new syscalls. You get a file descriptor, and can apply +normal syscalls to it: read/write/sendfile/lseek, +fstat/fchown/fchmod, whatever. + + +3. Access permissions +~~~~~~~~~~~~~~~~~~~~~ + +The reiserfs-raw syscalls do not work unless you mount the +filesystem with -o raw (see the "Mount flags" section below). + +The permission to create or open files on a reiserfs-raw disk is +governed by the ownership and permission bits of the root directory of +the reiserfs-raw filesystem (O_WRITE is required for CREAT, O_EXEC for +OPEN). Permission bits and ownership of the files proper are +_IGNORED_ (yes, IGNORED) by the reiserfs_raw interface, for speed. + +UNLINK permission is also governed by the mode of the root directory +of the raw filesystem.
However, the UNLINK operation will refuse to +remove a non-raw file (a file that wasn't created using RAWCREAT), +because doing so would damage the filesystem structure. + +Security clarification: + +1) The simplified security model described in this section only +applies to filesystems mounted with -o raw. Filesystems mounted +without -o raw behave as usual, and are safe from all this. + +2) If you mount a filesystem with -o raw, and you have O_EXEC access +to the root directory of the filesystem, you can open *ANY* file on +this filesystem, regardless of access permissions of the file. So, to +protect the files on a reiserfs_raw partition from users, put +a restrictive mode on the root directory of the filesystem. Example: + + mkreiserfs /dev/sdb1 + mount -o raw /dev/sdb1 /cache + chmod 700 /cache + chown squid /cache + +3) Reiserfs_raw is secure if used right. Rule of thumb: don't have +any regular (named) files on raw partitions, don't mount regular +partitions with -o raw, choose ownership & permission of the root +directory of the raw filesystem properly, and you are safe. + + +4. Mount flags +~~~~~~~~~~~~~~ + +The raw reiserfs is not a valid filesystem from the viewpoint of +reiserfsck. Therefore, to avoid messing up "normal" reiserfs disks, +the reiserfs-raw interface will work with a disk only if you mount the +disk with -o raw. + +To specify the desired disk usage limit that PGC will try to maintain, +use the -o pgc=L,H mount option. L and H are lower and upper +thresholds in percent, 0 < L <= H <= 100. PGC does nothing until +disk usage reaches L. When disk usage reaches L, PGC starts mildly +removing the most aged files, and stops when disk usage falls below L. If +mild removal doesn't help and disk usage continues growing, PGC sets +the threshold more and more aggressively as disk usage approaches H, the +goal being to keep disk usage below H at any price. + +Example mount command: + +mount -t reiserfs -onotail,noatime,raw,pgc=50,70 /dev/sdb1 /cache1 + + +5.
Credits +~~~~~~~~~~ + +The original idea of the raw reiserfs interface comes from Nikita +Danilov . PGC has been proposed by Yury +Shevchuk and prototyped by Nikita Danilov. The +complete code of reiserfs-raw+PGC has been written by Yury Shevchuk. + +The work has been done as a part of the ReiserFS project led by Hans +Reiser , under sponsorship from Integrated Linux +(http://www.integratedlinux.com). Thanks! diff -rupN linux-kaio/fs/Config.in linux/fs/Config.in --- linux-kaio/fs/Config.in Sat Jul 21 20:19:58 2001 +++ linux/fs/Config.in Sat Aug 25 17:16:00 2001 @@ -10,6 +10,7 @@ tristate 'Kernel automounter version 4 s dep_tristate 'Reiserfs support' CONFIG_REISERFS_FS $CONFIG_EXPERIMENTAL dep_mbool ' Have reiserfs do extra internal checking' CONFIG_REISERFS_CHECK $CONFIG_REISERFS_FS $CONFIG_EXPERIMENTAL +dep_mbool ' Enable reiserfs_raw interface' CONFIG_REISERFS_RAW $CONFIG_REISERFS_FS $CONFIG_EXPERIMENTAL dep_tristate 'ADFS file system support' CONFIG_ADFS_FS $CONFIG_EXPERIMENTAL dep_mbool ' ADFS write support (DANGEROUS)' CONFIG_ADFS_FS_RW $CONFIG_ADFS_FS $CONFIG_EXPERIMENTAL diff -rupN linux-kaio/fs/read_write.c linux/fs/read_write.c --- linux-kaio/fs/read_write.c Sun Aug 26 00:09:13 2001 +++ linux/fs/read_write.c Sat Aug 25 22:13:25 2001 @@ -449,6 +449,11 @@ asmlinkage int sys_aio(int cmd, { return(kaio_rw(cmd, (aiocb_t *)arg1, 0)); } + case AIOCMD_IOCTL: + case AIOCMD_CLOSE: + { + return(kaio_syscall(cmd, (aiocb_t *)arg1)); + } case AIOCMD_SUSPEND: { /* aio_suspend(aiocb_t *list[], nent, timeout) */ diff -rupN linux-kaio/fs/reiserfs/Makefile linux/fs/reiserfs/Makefile --- linux-kaio/fs/reiserfs/Makefile Mon Jan 15 22:42:32 2001 +++ linux/fs/reiserfs/Makefile Sat Aug 25 17:18:11 2001 @@ -11,6 +11,10 @@ O_TARGET := reiserfs.o obj-y := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \ lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o tail_conversion.o version.o item_ops.o ioctl.o
+ifdef CONFIG_REISERFS_RAW +obj-y += raw.o pgc.o age_class.o +endif + obj-m := $(O_TARGET) include $(TOPDIR)/Rules.make diff -rupN linux-kaio/fs/reiserfs/age_class.c linux/fs/reiserfs/age_class.c --- linux-kaio/fs/reiserfs/age_class.c Thu Jan 1 03:00:00 1970 +++ linux/fs/reiserfs/age_class.c Sat Aug 25 17:18:36 2001 @@ -0,0 +1,211 @@ +/* + * fs/reiserfs/age_class.c + * + * Class LRU routines for file aging and replacement. + * + * Written by Yury Shevchuk + * + * Changes: + * 20000826 Linear interpolation for threshold calculation + * proposed by Nikita Danilov + * + * Copyright 2000 Hans Reiser, see reiserfs/README for licensing and copyright details + * + * $Id: age_class.c,v 1.1.2.6 2000/09/08 14:42:45 sizif Exp $ + */ + +#include + +#define DUMMY(ARGS...) +#define debug(N,M) DUMMY + +#ifdef __KERNEL__ +# include +# include +# include +# define assert(X) do { if (!(X)) BUG(); } while (0) /* only BUG when the condition is false */ +# define snprintf(STR, N, FMT, ARGS...) sprintf (STR, FMT, ## ARGS) +# define SAFETY_MARGIN 100 +#else +# define SAFETY_MARGIN 0 /* not needed if snprintf is available */ +#endif + + +void +age_class_init (age_class *a, time_t curtime) +{ + int i = AGE_CLASS_COUNT; + while (--i >= 0) { + a->class[i].c = 0; + a->class[i].t = curtime; + } + a->total = 0; +} + + +/* count object with timestamp TIME in appropriate class (INC=1 for + arrival, INC=-1 for departure) */ + +void +age_class_update (age_class *a, time_t time, int inc) +{ + int n = age_class_find (a, time); + unsigned old = a->class[n].c; + unsigned new = a->class[n].c + inc; + + if ((old^new) & 0x80000000) { + /* Counter overflow can occur if you have too many (>> 4G) + objects -- use 64 bit counters then, while underflow means + an error somewhere (missing positive update) */ + debug(86, 1) + ("age_class_update: class[%u].c = %u + (%d) would overflow, refusing\n", + n, old, inc); + return; + } + + a->class[n].c = new; + a->total += inc; +} + + +/* find age class where timestamp TIME currently belongs */ + +int +age_class_find (age_class *a, time_t
time) +{ + int i; + for (i = 0; i < AGE_CLASS_COUNT; i++) { + if (time >= a->class[i].t) + return i; + } + assert (0); + return 0; +} + + +/* The periodical shift procedure. Returns 0 if shift isn't done + because class0 isn't big enough or no time has passed, 1 otherwise */ + +int +age_class_shift (age_class *a, time_t curtime) +{ + unsigned long min; + int i, chosen; + + /* Don't shift unless class[0].c is big enough, and time interval + is wider than zero. */ + if (a->class[0].c <= a->total/AGE_CLASS_COUNT + || a->class[0].t >= curtime) + return 0; + + /* Consider all neighbouring class pairs to find the pair to + merge. We will merge the pair for which the sum of the + counters is minimal. */ + + min = ULONG_MAX; + chosen = 1; + + for (i = 1; i < AGE_CLASS_COUNT; i++) { + unsigned long sum = (unsigned long) a->class[i-1].c + a->class[i].c; + if (sum < min) { + min = sum; + chosen = i; + } + } + + debug(86, 5) ("age_class_shift: shift up to class[%u]\n", chosen); + + /* Merge class[chosen-1] into class[chosen]; class[chosen].t stays as the older bound. */ + a->class[chosen].c += a->class[chosen-1].c; + /* Shift the younger classes up by one; i stops at 1 so class[i-1] stays in bounds even when chosen == 1. */ + for (i = chosen-1; i > 0; i--) { + a->class[i].c = a->class[i-1].c; + a->class[i].t = a->class[i-1].t; + } + /* Open a fresh, empty class 0 that starts now. */ + a->class[0].c = 0; + a->class[0].t = curtime; + + return 1; +} + + +/* dump age_class for debugging purposes */ + +char * +age_class_dump (age_class *a, time_t curtime, char *buf, int bufsz) +{ + char *p = buf; + int room_left = bufsz - SAFETY_MARGIN; + time_t t = curtime; + int i; + + for (i = 0; i < AGE_CLASS_COUNT; i++) { + int n = snprintf (p, room_left, + " class %02d: files %-12u span %10ld..%-10ld = %-10ld\n", + i, a->class[i].c, a->class[i].t+1, t, t-a->class[i].t); + p += n; + if ((room_left -= n) <= 0) + break; + t = a->class[i].t; + } + if (room_left > 0) { + int n = snprintf (p, room_left, + " total files %-12u\n", a->total); + p += n; + room_left -= n; + } + return buf; +} + + +/* Compute age threshold based on desired file ratio.
The ratio is + defined as M/N*100 where M is the count of files above threshold, N + is the total number of files. For example, for ratio=20 threshold + will be chosen so that every 5th file will be above the threshold. */ + +time_t +age_class_threshold (age_class *a, unsigned ratio, time_t curtime) +{ + unsigned long aged = 0; + unsigned long r1, r2 = 0; + time_t t1, t2; + int i = AGE_CLASS_COUNT; + + /* We should never be called if there are no files, + but let's play safe */ + if (! a->total) + return 0; + + t1 = t2 = r1 = 0; /* only to avoid incorrect warning */ + + while (--i > 0) { + aged += a->class[i].c; + if (aged == 0) + continue; + r1 = r2; + r2 = aged*100/a->total; + if (r2 >= ratio) { + t1 = a->class[i].t; + t2 = a->class[i-1].t; + break; + } + } + + if (i == 0) { + r1 = r2; + r2 = 100; + t1 = a->class[0].t; + t2 = curtime; + } + + /* Now the threshold is within class i, r1 <= ratio <= r2, t1 <= age_threshold <= t2. + Interpolate to find the threshold; r1 == r2 only happens for ratio == 0, where the result is t1 */ + + return (r2 == r1) ? t1 : t1 + (t2-t1)*(ratio - r1)/(r2-r1); +} + +/* + ;;; Local Variables: *** + ;;; tab-width:8 *** +*/ diff -rupN linux-kaio/fs/reiserfs/dir.c linux/fs/reiserfs/dir.c --- linux-kaio/fs/reiserfs/dir.c Sat Jul 21 20:20:56 2001 +++ linux/fs/reiserfs/dir.c Sat Aug 25 17:48:44 2001 @@ -28,6 +28,7 @@ struct file_operations reiserfs_dir_oper read: generic_read_dir, readdir: reiserfs_readdir, fsync: reiserfs_dir_fsync, + ioctl: reiserfs_dir_ioctl, }; /* diff -rupN linux-kaio/fs/reiserfs/inode.c linux/fs/reiserfs/inode.c --- linux-kaio/fs/reiserfs/inode.c Sat Jul 21 20:20:56 2001 +++ linux/fs/reiserfs/inode.c Sun Aug 26 02:44:30 2001 @@ -58,7 +58,7 @@ void reiserfs_delete_inode (struct inode unlock_kernel() ; } -static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, +void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, loff_t offset, int type, int length) { key->version = version; @@ -923,6 +923,17 @@ static void init_inode
(struct inode * i inode->i_generation = INODE_PKEY (inode)->k_dir_id; else inode->i_generation = le32_to_cpu( sd->u.sd_generation ); +#ifdef CONFIG_REISERFS_RAW + if (ih->ih_item_len == RAW_SD_SIZE) { + inode->u.reiserfs_i.lastref = le32_to_cpu (ATTR_LASTREF (sd)); + inode->u.reiserfs_i.expires = le32_to_cpu (ATTR_EXPIRES (sd)); + inode->u.reiserfs_i.user1 = le32_to_cpu (ATTR_USER1 (sd)); + } + else { + /* make raw/non-raw inodes clearly distinguishable */ + inode->u.reiserfs_i.lastref = 0; + } +#endif } /* nopack = 0, by default */ @@ -947,7 +958,7 @@ static void init_inode (struct inode * i // update new stat data with inode fields -static void inode2sd (void * sd, struct inode * inode) +void inode2sd (void * sd, struct inode * inode) { struct stat_data * sd_v2 = (struct stat_data *)sd; @@ -1010,7 +1021,15 @@ static void update_stat_data (struct pat // path points to old stat data inode2sd_v1 (B_I_PITEM (bh, ih), inode); } else { - inode2sd (B_I_PITEM (bh, ih), inode); + struct stat_data *sd = (struct stat_data *)B_I_PITEM (bh, ih); + inode2sd (sd, inode); +#ifdef CONFIG_REISERFS_RAW + if (ih->ih_item_len == RAW_SD_SIZE) { + ATTR_LASTREF(sd) = cpu_to_le32 (inode->u.reiserfs_i.lastref); + ATTR_EXPIRES(sd) = cpu_to_le32 (inode->u.reiserfs_i.expires); + ATTR_USER1(sd) = cpu_to_le32 (inode->u.reiserfs_i.user1); + } +#endif } return; @@ -1131,21 +1150,61 @@ void reiserfs_read_inode2 (struct inode } +/* + * For fs mounted in raw mode, we want 64bit inode numbers (in raw + * mode they serve as filenames, and we want to minimize collision + * probability). Hence the need for find_actor. For non-raw fs it + * is not used, so no harm to efficiency. 
+ */ +static int +reiserfs_find_actor(struct inode *inode, unsigned long ino, void *opaque) +{ + struct reiserfs_iget4_args *args = opaque; + __u32 dir_id = args->objectid; + + return (dir_id == INODE_PKEY(inode)->k_dir_id); +} + struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key) { struct inode * inode; struct reiserfs_iget4_args args ; + find_inode_t find_actor = reiserfs_raw(s)? reiserfs_find_actor : NULL; args.objectid = key->on_disk_key.k_dir_id ; + inode = iget4 (s, key->on_disk_key.k_objectid, find_actor, &args); +#if 0 /* NOTE(review): this compiles out the comp_short_keys() stale-handle check, and find_actor is NULL on non-raw mounts, so nothing visible here compares k_dir_id in that case -- confirm that is intended */ inode = iget4 (s, key->on_disk_key.k_objectid, 0, (void *)(&args)); if (!inode) return inode ; - + if (comp_short_keys (INODE_PKEY (inode), key) || is_bad_inode (inode)) { - /* either due to i/o error or a stale NFS handle */ - iput (inode); - inode = 0; + /* either due to i/o error or a stale NFS handle */ + iput (inode); + inode = 0; } +#endif + if (inode && is_bad_inode (inode)) { + if (reiserfs_raw (inode->i_sb)) + /* In RAW mode, query for nonexistent inode is part of + normal operation -- just like attempts to open + nonexistent file in normal mode. If we iput() the + resulting bad inode as is, the inode will stay in + i-cache, which results in cluttering i-cache with bad + inodes. To avoid that, we drop i_nlink so iput can + destroy the inode at once */ + inode->i_nlink = 0; + else + /* In normal mode, bad inodes are rare exception */ + reiserfs_warning ("vs-13048: reiserfs_iget: " + "bad_inode. Stat data of (%lu %lu) not found\n", + key->on_disk_key.k_dir_id, key->on_disk_key.k_objectid); + iput (inode); + inode = 0; + } + if (!
inode) + inode = ERR_PTR (-ENOENT); /* callers now get ERR_PTR(-ENOENT) instead of a NULL pointer */ + return inode; } diff -rupN linux-kaio/fs/reiserfs/ioctl.c linux/fs/reiserfs/ioctl.c --- linux-kaio/fs/reiserfs/ioctl.c Thu Mar 22 06:28:56 2001 +++ linux/fs/reiserfs/ioctl.c Sat Aug 25 20:21:43 2001 @@ -27,14 +27,73 @@ int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned long arg) { + struct reiserfs_raw_ioctl_arg ra; + struct reiserfs_raw_ioctl_arg *user_ra = (struct reiserfs_raw_ioctl_arg *)arg; + switch (cmd) { case REISERFS_IOC_UNPACK: if (arg) return reiserfs_unpack (inode, filp); - + return -ENOTTY; + +#ifdef CONFIG_REISERFS_RAW + case REISERFS_IOC_RAWSETATTR: + if (copy_from_user (&ra, user_ra, sizeof ra)) + return -EFAULT; + return reiserfs_rawsetattr_f (inode, &ra); +#endif /* CONFIG_REISERFS_RAW */ + default: return -ENOTTY; } +} +/* ioctl entry point for directories: copies the argument struct in from user space, dispatches the RAW* and TIMEDWAIT commands, and copies the struct back out for RAWOPEN only. */ +int reiserfs_dir_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + struct reiserfs_raw_ioctl_arg ra; + struct reiserfs_raw_ioctl_arg *user_ra = (struct reiserfs_raw_ioctl_arg *)arg; + int r; + + switch (cmd) { +#ifdef CONFIG_REISERFS_RAW + case REISERFS_IOC_RAWOPEN: + if (copy_from_user (&ra, user_ra, sizeof ra)) + return -EFAULT; + if ((r = reiserfs_rawopen (inode, filp->f_vfsmnt, &ra)) < 0) + return r; + if (copy_to_user (user_ra, &ra, sizeof ra)) + return -EFAULT; /* NOTE(review): r (an fd, per the comment below) is not released on this path -- confirm */ + return r; /* fd */ + + case REISERFS_IOC_RAWCREAT: + if (copy_from_user (&ra, user_ra, sizeof ra)) + return -EFAULT; + return reiserfs_rawcreat (inode, filp->f_vfsmnt, &ra); + + case REISERFS_IOC_RAWUNLINK: + if (copy_from_user (&ra, user_ra, sizeof ra)) + return -EFAULT; + return reiserfs_rawunlink (inode, &ra); + + case REISERFS_IOC_RAWSETATTR: + if (copy_from_user (&ra, user_ra, sizeof ra)) + return -EFAULT; + return reiserfs_rawsetattr (inode, &ra); /* NOTE(review): ra is not copied back to user space here, although Documentation/reiserfs-raw.txt says SETATTR fills the argument with the old attributes -- confirm */ + case REISERFS_IOC_TIMEDWAIT: + { + struct reiserfs_sigtimedwait_arg sa; + if( copy_from_user + ( &sa, + ( struct reiserfs_sigtimedwait_arg * ) arg, sizeof sa ) ) + return -EFAULT; + return
reiserfs_sigtimedwait( inode, &sa ); + } +#endif /* CONFIG_REISERFS_RAW */ + + default: + return -EINVAL; /* NOTE(review): reiserfs_ioctl returns -ENOTTY for unknown commands; confirm -EINVAL is intended here */ + } } /* diff -rupN linux-kaio/fs/reiserfs/objectid.c linux/fs/reiserfs/objectid.c --- linux-kaio/fs/reiserfs/objectid.c Sat Apr 28 00:18:08 2001 +++ linux/fs/reiserfs/objectid.c Sat Aug 25 18:36:21 2001 @@ -170,9 +170,9 @@ void reiserfs_release_objectid (struct r } i += 2; } - - reiserfs_warning ("vs-15010: reiserfs_release_objectid: tried to free free object id (%lu)", - objectid_to_release); + if (! reiserfs_raw (s)) + reiserfs_warning ("vs-15010: reiserfs_release_objectid: tried to free free object id (%lu)", + objectid_to_release); } diff -rupN linux-kaio/fs/reiserfs/pgc.c linux/fs/reiserfs/pgc.c --- linux-kaio/fs/reiserfs/pgc.c Thu Jan 1 03:00:00 1970 +++ linux/fs/reiserfs/pgc.c Sat Aug 25 19:50:45 2001 @@ -0,0 +1,459 @@ +/* + * fs/reiserfs/pgc.c + * + * "Passive garbage collector" + * + * Keeps disk usage within the bounds specified with the pgc=L,H mount option + * by unlinking aged files from the blocks that pass through search_by_key. + * We never do any active disk scan to find aged files, we just remove them + * as they pass by, thus avoiding extra disk activity. + + We might find it to be best to have all read-ahead reach memory + rather than staying in just the disk cache, so that PGC can occur + on it. This would involve turning off disk read-ahead and doing it + in the FS.. -Hans + + * + * Prototype by Nikita Danilov + * Rewritten for use with reiserfs_raw by Yury Shevchuk + * + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * $Id: pgc.c,v 1.1.4.14 2000/12/22 08:52:04 sizif Exp $ */ + +#include +#include +#include +#include +#include + + +#define PGC_DEBUG_LEVEL 1 +#define pgc_debug(N, X) \ + do { if ((N) <= PGC_DEBUG_LEVEL) reiserfs_warning X; } while (0) + +/* how often to re-calculate PGC threshold (in seconds) */ +#define PGC_THRESHOLD_PERIOD 1 + +/* The size of the unlink queue.
It should be big enough to contain + the max number of unlinks we can reap from a single reiserfs node, so + we don't lose unlinks. Note that the q is processed on exit from + reiserfs_raw syscalls, and we should account for the possibility + of several concurrent processes filling it... We can make it + considerably big, as they are few (one unlinkq per fs mount.) */ +#define PGC_UNLINKQ_SZ (50 + PAGE_SIZE/RAW_SD_SIZE*50) + +/* Size of the rescan protection store */ +#define PGC_SCAN_RECENT_SZ (MAX_HEIGHT*6 + 2) + +struct pgc { + unsigned long lo; /* low water mark, blocks */ + unsigned long hi; /* high water mark, blocks */ + time_t age_threshold; /* remove files older than this */ + unsigned long age_threshold_last; /* last recalculation time (jiffies) */ + struct age_class age; /* age class table */ + struct key unlinkq[PGC_UNLINKQ_SZ]; /* keys queued for unlink */ + int unlinkq_count; /* unlinkq filled slot count */ + int scan_buffer_lock; /* flag to inhibit scan_buffer + while processing unlinkq */ + struct buffer_head *scan_recent[PGC_SCAN_RECENT_SZ]; /* recently scanned buffers (rescan protection) */ + int scan_recent_idx; /* slot of scan_recent[] written most recently */ +}; + + + +//static read_proc_t pgc_proc_read_age_class; +static void pgc_shift_if_nec (struct reiserfs_transaction_handle *th); +static time_t pgc_recalc_threshold_if_nec (struct super_block *sb); +static void pgc_save_age_class (struct reiserfs_transaction_handle *th); +static void pgc_restore_age_class (struct super_block *sb); +static int pgc_push_unlinkq (struct super_block *sb, struct key *); + + +/* Allocate and initialise the PGC control data of SB. Returns 0 on success, -1 if the allocation fails. */ +int +pgc_init (struct super_block *sb) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + + rsi->pgc = reiserfs_kmalloc (sizeof (struct pgc), GFP_KERNEL, sb); + if (!
rsi->pgc) { + pgc_debug (1, ("pgc_init: no mem for PGC control data for %s\n", + bdevname (sb->s_dev))); + return -1; + } + rsi->pgc->unlinkq_count = 0; + rsi->pgc->scan_buffer_lock = 0; + rsi->pgc->scan_recent_idx = 0; /* NOTE(review): scan_recent[], lo and hi are not initialised here; pgc_scan_buffer compares against scan_recent[] entries -- confirm reiserfs_kmalloc returns zeroed memory */ + rsi->pgc->age_threshold = 0; + rsi->pgc->age_threshold_last = jiffies; + + pgc_restore_age_class (sb); + if (! rsi->pgc->age.total) + age_class_init (&rsi->pgc->age, CURRENT_TIME); + //reiserfs_proc_register (sb, "age_class", pgc_proc_read_age_class); + return 0; +} +/* Calls pgc_save_age_class (th) and then frees the PGC control data of SB. */ +void +pgc_cleanup (struct super_block *sb, struct reiserfs_transaction_handle *th) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + + //reiserfs_proc_unregister(sb, "age_class"); + pgc_save_age_class (th); + reiserfs_kfree (rsi->pgc, sizeof (struct pgc), sb); /* NOTE(review): rsi->pgc is not reset to NULL afterwards, while the other entry points test it for NULL -- confirm nothing calls them after cleanup */ +} + + +/* Converts PGC thresholds from percent to block counts and stores + them in the pgc info structure; does nothing if PGC is disabled */ + +void +pgc_set_thresholds (struct super_block * s, int lo, int hi) +{ + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + struct pgc *pgc = s->u.reiserfs_sb.pgc; + + if (!pgc) + return; + + pgc->lo = le32_to_cpu (rs->s_block_count)/100 * lo; + pgc->hi = le32_to_cpu (rs->s_block_count)/100 * hi; +} + + + +/* The main function of PGC. Scan reiserfs formatted node looking for + aged stat data; if found, unlink the files */ + +void +pgc_scan_buffer (struct super_block *sb, struct buffer_head *bh) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + struct pgc *pgc = rsi->pgc; + time_t age_threshold; + int i; + + if (! pgc) /* PGC disabled */ + return; + + /* not sure if we have the lock already. Get it just in case. */ + lock_kernel (); + + /* Do nothing if called from pgc_process_unlinkq: these buffers + are likely scanned and all unlinkables in them are queued already */ + if (++pgc->scan_buffer_lock > 1) + goto out; + + /* When reiserfs_get_block works, we get the same buffer + repeatedly. Remember the last buffer seen and don't rescan it.
+ More than that, remember several recently scanned buffers. Or + else we get a substantial number of repetitions in unlinkq, which + results in needless reiserfs_iget for already deleted inodes in + pgc_process_unlinkq */ + i = PGC_SCAN_RECENT_SZ; + while (--i >= 0) { + if (bh == pgc->scan_recent[i]) + goto out; + } + if (--pgc->scan_recent_idx < 0) + pgc->scan_recent_idx = PGC_SCAN_RECENT_SZ-1; + pgc->scan_recent[pgc->scan_recent_idx] = bh; + + age_threshold = pgc_recalc_threshold_if_nec (sb); + if (! age_threshold) + goto out; + + for (i = 0; i < B_NR_ITEMS (bh); i++) { + struct item_head *ih = B_N_PITEM_HEAD (bh,i); + + if (ih_item_len (ih) == RAW_SD_SIZE + && ih_version (ih) == ITEM_VERSION_2 + && le16_to_cpu (ih->ih_key.u.k_offset_v2.k_type) == TYPE_STAT_DATA) { + + struct stat_data *sd = B_I_STAT_DATA (bh, ih); + + if (le32_to_cpu (ATTR_LASTREF (sd)) < age_threshold + || le32_to_cpu (ATTR_EXPIRES (sd)) < CURRENT_TIME) { + /* Aged file, remove. */ + pgc_debug (3, ("%s:%u: queuing unlink %k\n", + __FILE__, __LINE__, &ih->ih_key)); + /* However tempting it is to call reiserfs_rawunlink2 + right now, we will not. It will attempt to create + a transaction, while we are most likely inside an + open transaction already. If that open transaction + is not joinable, journal_begin will sleep to let it + finish, which means deadlock. So queue it and + remove later when we are for sure in user context.
*/ + if (pgc_push_unlinkq (sb, &ih->ih_key) < 0) + break; /* queue overflow */ + } + } + } + out: + pgc->scan_buffer_lock--; + unlock_kernel (); +} + +/* Append a copy of *KP to the unlink queue of SB. Returns 0, or -1 (after a warning) when the queue is full. */ +static int +pgc_push_unlinkq (struct super_block *sb, struct key *kp) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + struct pgc *pgc = rsi->pgc; + + if (pgc->unlinkq_count >= PGC_UNLINKQ_SZ) { + reiserfs_warning ("pgc_push_unlinkq: queue full (%u), dropping victim\n", + pgc->unlinkq_count); + return -1; + } + + pgc->unlinkq[pgc->unlinkq_count++] = *kp; + return 0; +} +/* Drain the unlink queue: look up each queued key, re-check its inode against the age threshold and expiry, and unlink it if it is still aged. */ +void +pgc_process_unlinkq (struct super_block *sb) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + struct pgc *pgc = rsi->pgc; + time_t age_threshold; + unsigned long starttime; + + if (! pgc) /* PGC disabled */ + return; + if (! pgc->unlinkq_count) /* nothing queued for unlink */ + return; + if (pgc->scan_buffer_lock) /* pgc_scan_buffer or another pgc_process_unlinkq is active */ + return; + + age_threshold = pgc_recalc_threshold_if_nec (sb); + if (! age_threshold) { + /* No need for space anymore? You're free, victims! */ + pgc->unlinkq_count = 0; + return; + } + + pgc->scan_buffer_lock++; + starttime = jiffies; + + /* Process queue in LIFO order, so we hopefully use recently + cached things in CPU cache. */ + + while (pgc->unlinkq_count) { + struct key *kp = &pgc->unlinkq[--pgc->unlinkq_count]; + struct cpu_key key; + struct inode *inode; + + /* this is an indirect measure, we + should code to more directly avoid + doing disk I/O, and this should + become a debugging check we never + see, once you have the time to code + it....:-) -Hans */ + if (jiffies - starttime > HZ/4) { + /* Looks like the queue grew so long that the least recent + keys got pushed from caches and unlinking them isn't + cheap (probably involves disk access).
Purge the queue + and go pick fresh victims */ + pgc_debug (2, ("pgc_process_unlinkq: 0.25s spent in unlinkq\n")); + pgc->unlinkq_count = 0; + break; + } + + _make_cpu_key (&key, ITEM_VERSION_2, + le32_to_cpu (kp->k_dir_id), + le32_to_cpu (kp->k_objectid), + SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + /* need an iget that refuses to read + disk if reading disk is necessary */ + inode = reiserfs_iget (sb, &key); + if (IS_ERR (inode)) { /* already deleted. Was in the queue twice? */ + static int i = 0; + i++; + if ((i & (i-1)) == 0) /* warn only when the running total is a power of two */ + reiserfs_warning ("pgc_process_unlinkq: %u overunlinks total\n", i); + continue; + } + /* check to make sure it didn't get accessed while sitting on + the unlinkq, since if it did we should not unlink it. */ + if (inode->u.reiserfs_i.lastref < age_threshold + || inode->u.reiserfs_i.expires < CURRENT_TIME) { + reiserfs_rawunlink2 (inode); + } + else { + /* someone referenced the file in the last second... + let it go free */ + iput (inode); + } + } + + pgc->scan_buffer_lock--; +} + + +/* Return the coarse-grained current time for use as lastref. */ + +time_t +pgc_current_time (struct super_block *sb) +{ + struct pgc *pgc = sb->u.reiserfs_sb.pgc; + + if (! pgc) + return CURRENT_TIME; + return pgc->age.class[0].t + 1; +} + + +/* Update age class counters, and schedule the update to disk in the + same transaction which will write updated stat data of the inode + that caused the update */ + +void +pgc_age_class_update (struct reiserfs_transaction_handle *th, + time_t old_lastref, + time_t new_lastref) +{ + struct super_block *sb = th->t_super; + struct pgc *pgc = sb->u.reiserfs_sb.pgc; + + if (! pgc) /* PGC disabled */ + return; + if (sb->s_flags & MS_RDONLY) + return; + + pgc_shift_if_nec (th); + if (old_lastref) age_class_update (&pgc->age, old_lastref, -1); + if (new_lastref) age_class_update (&pgc->age, new_lastref, +1); + pgc_save_age_class (th); +} + + +/* Periodic class counter shift procedure.
First implementation + attempt used timers, but that implies the need for extensive + locking, so we'll better do this in user context. */ + +static void +pgc_shift_if_nec (struct reiserfs_transaction_handle *th) +{ + struct super_block *sb = th->t_super; + struct age_class *age = &sb->u.reiserfs_sb.pgc->age; + + if (age->class[0].t >= CURRENT_TIME) /* recently shifted */ + return; + + if (age_class_shift (age, CURRENT_TIME)) + pgc_save_age_class (th); +} + + +/* Periodic age threshold update procedure: calculates age_threshold + for GC based on current disk usage */ + +static time_t +pgc_recalc_threshold_if_nec (struct super_block *sb) +{ + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK (sb); + struct pgc *pgc = rsi->pgc; + unsigned long du = (le32_to_cpu (rs->s_block_count) + - le32_to_cpu (rs->s_free_blocks)); + unsigned unlink_ratio; + + if ((jiffies - pgc->age_threshold_last) < PGC_THRESHOLD_PERIOD*HZ) + goto out; /* don't recalculate too often. */ + + pgc_debug (2, ("%s:%u: used=%lu pgclo=%lu pgchi=%lu j=%lu, last=%lu\n", + __FILE__, __LINE__, du, pgc->lo, pgc->hi, + jiffies, pgc->age_threshold_last)); + + if (du < pgc->lo) { + /* low disk usage, no need to clean anything yet */ + unlink_ratio = 0; + pgc->age_threshold = 0; + } + else { + /* When disk usage exceeds pgclo, we start removing; the + closer to pgchi, the more aggressive. At pgclo, we want to + remove 5 files of 100 on the average, at pgchi we want to + remove 95 files of 100. TODO: see if the numbers are + chosen well. Make them configurable via /proc? 
*/ +#define LO_RATIO 5 +#define HI_RATIO 95 + unlink_ratio = (LO_RATIO + + ((du - pgc->lo) + * (HI_RATIO - LO_RATIO) + / (pgc->hi - pgc->lo + 1/*avoid div by 0*/))); + if (unlink_ratio > HI_RATIO) + unlink_ratio = HI_RATIO; + + pgc->age_threshold = + age_class_threshold (&pgc->age, unlink_ratio, CURRENT_TIME); + } + + pgc->age_threshold_last = jiffies; + + pgc_debug (2, ("%s:%u: unlink_ratio=%lu, set age_threshold=%lu\n", + __FILE__, __LINE__, + unlink_ratio, pgc->age_threshold)); + out: + return pgc->age_threshold; +} + + + +/* proc read routine for /proc/fs/reiserfs/XXX/age_class */ +/* +static int +pgc_proc_read_age_class (char *buffer, char **start, off_t offset, + int count, int *eof, void *data) +{ + struct super_block *sb = (void *) data; + struct reiserfs_sb_info *rsi = &sb->u.reiserfs_sb; + int len = 0; + + len += sprintf (&buffer[len], "Age threshold: %lu\n", + rsi->pgc->age_threshold); + + age_class_dump (&rsi->pgc->age, CURRENT_TIME, &buffer[len], PAGE_SIZE-len); + len = strlen (buffer); + + if (offset >= len) { + *start = buffer; + *eof = 1; + return 0; + } + *start = buffer + offset; + if ((len -= offset) > count) + return count; + *eof = 1; + return len; +} +*/ + +static void +pgc_save_age_class (struct reiserfs_transaction_handle *th) +{ + struct super_block *sb = th->t_super; + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK (sb); + struct age_class *age = &sb->u.reiserfs_sb.pgc->age; + struct buffer_head *sbh = SB_BUFFER_WITH_SB (sb); + + if (sb->s_flags & MS_RDONLY) + return; + reiserfs_prepare_for_journal (sb, sbh, 1) ; + /* update super block. 
TODO: should use cpu_to_le32 */ + memcpy (rs->s_unused, age, sizeof (struct age_class)); + journal_mark_dirty (th, sb, sbh); +} + +static void +pgc_restore_age_class (struct super_block *sb) +{ + struct reiserfs_super_block *rs = SB_DISK_SUPER_BLOCK (sb); + struct age_class *age = &sb->u.reiserfs_sb.pgc->age; + + /* TODO: should use le32_to_cpu */ + memcpy (age, rs->s_unused, sizeof (struct age_class)); +} diff -rupN linux-kaio/fs/reiserfs/raw.c linux/fs/reiserfs/raw.c --- linux-kaio/fs/reiserfs/raw.c Thu Jan 1 03:00:00 1970 +++ linux/fs/reiserfs/raw.c Sat Aug 25 17:18:42 2001 @@ -0,0 +1,639 @@ +/* + * fs/reiserfs/raw.c + * + * "Raw" reiserfs tree interface: access files by their objectid, + * bypassing directories. + * + * The primary target of this is Squid proxy cache acceleration, + * but it may have other uses, who knows. + * + * See Documentation/filesystems/reiserfs_raw.txt for more details. + * + * Idea from Nikita Danilov + * Written by Yury Shevchuk + * + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + * + * $Id: raw.c,v 1.1.2.16 2000/12/22 08:52:04 sizif Exp $ + */ + +#include +#include +#include +#include +#include + +static struct inode * reiserfs_raw_new_inode (struct reiserfs_transaction_handle *th, + struct inode *dir, + struct cpu_key *key, + struct inode *inode, + struct reiserfs_raw_ioctl_arg *ra, + int * err); +static void reiserfs_raw_id_correct (struct reiserfs_raw_ioctl_arg *ra); + + +/* Create raw file. 
Arguments are a directory inode (used only to + find superblock, so you can opendir() any directory on the fs), and + the user arguments structure (see reiserfs_fs.h) */ + +int +reiserfs_rawcreat (struct inode * dir, struct vfsmount *mnt, + struct reiserfs_raw_ioctl_arg *ra) +{ + struct super_block *sb = dir->i_sb; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + struct cpu_key key; + struct reiserfs_transaction_handle th; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; + int windex; + int error; + int fd; + + if (! reiserfs_raw (sb)) + return -EPERM; + if (INODE_PKEY(dir)->k_objectid != REISERFS_ROOT_OBJECTID || + INODE_PKEY(dir)->k_dir_id != REISERFS_ROOT_PARENT_OBJECTID) + return -EPERM; + if ((error = permission (dir, MAY_WRITE))) + return error; + + fd = get_unused_fd(); + if (fd < 0) + return fd; + + /* + * Along the lines of reiserfs_create, but don't reiserfs_add_entry + */ + + inode = get_empty_inode() ; + if (!inode) { + error = -ENOMEM; + goto cleanup_fd; + } + + journal_begin (&th, sb, jbegin_count) ; + windex = push_journal_writer("rawcreat") ; + + /* make key for new stat data */ + reiserfs_raw_id_correct (ra); + _make_cpu_key (&key, ITEM_VERSION_2, + le32_to_cpu (ra->id1), + le32_to_cpu (ra->id2), + SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + inode = reiserfs_raw_new_inode (&th, dir, &key, inode, ra, &error); + if (!inode) + goto cleanup_journal; /* reiserfs_new_inode iput()s on error */ + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations ; + + /* Now we need to build the file structure. This is normally done + using filp_open(), but that would go through namei and require + a filename which we don't have. So we create a dummy dentry, + which is very conveniently done using d_alloc_root(). We don't + put the dentry in the dentry hash: it has no name and is + useless there. 
It will be freed on file close: fput -> __fput + -> dput */ + + dentry = d_alloc_root(inode); + if (! dentry) { + /* error still holds 0 from the permission() check; set it so + the failure is not returned to the caller as fd 0 */ + error = -ENOMEM; + goto cleanup_inode; + } + + filp = dentry_open(dentry, mntget(mnt), O_RDWR); + error = PTR_ERR(filp); + if (IS_ERR(filp)) { + /* a failed dentry_open() has already dput() the dentry and + with it our inode reference, so skip the iput below */ + goto cleanup_journal; + } + + pop_journal_writer(windex) ; + journal_end(&th, sb, jbegin_count) ; + + pgc_process_unlinkq (sb); + fd_install(fd, filp); + return fd; + + cleanup_inode: + iput (inode); + cleanup_journal: + pop_journal_writer(windex) ; + journal_end(&th, sb, jbegin_count) ; + pgc_process_unlinkq (sb); + cleanup_fd: + put_unused_fd (fd); + return error; +} + +/* Open raw file. rawopen opens a file by key (not filename), and also + returns some of the file attributes (the ones squid wants). This + choice of attributes is squid specific optimization, and users are + encouraged to ask us to generalize this interface for other + applications. + + Arguments are a directory inode (must be the root directory of the + fs mounted with -o raw), and the user arguments structure (see + reiserfs_fs.h) */ + +int +reiserfs_rawopen (struct inode * dir, struct vfsmount *mnt, + struct reiserfs_raw_ioctl_arg *ra) +{ + int error; + struct cpu_key key; + struct file *filp; + struct inode *inode; + struct dentry *dentry; + struct super_block *sb = dir->i_sb; + int fd; + + if (! 
reiserfs_raw (sb)) + return -EPERM; + if (INODE_PKEY(dir)->k_objectid != REISERFS_ROOT_OBJECTID || + INODE_PKEY(dir)->k_dir_id != REISERFS_ROOT_PARENT_OBJECTID) + return -EPERM; + if ((error = permission (dir, MAY_EXEC))) + return error; + + fd = get_unused_fd(); + if (fd < 0) + return fd; + + reiserfs_raw_id_correct (ra); + _make_cpu_key (&key, ITEM_VERSION_2, + le32_to_cpu (ra->id1), + le32_to_cpu (ra->id2), + SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + inode = reiserfs_iget (sb, &key); + if (IS_ERR (inode)) { + error = PTR_ERR (inode); + goto cleanup_fd; + } + if (!inode->i_nlink) { /* unlinked */ + /* reiserfs_read_inode2 should have caught such inode */ + reiserfs_warning ("sizif-174: iget fetched unlinked file?\n"); + /* error still holds 0 from the permission() check; set it so + the failure is not returned to the caller as fd 0 */ + error = -ENOENT; + goto cleanup_inode; + } + + dentry = d_alloc_root(inode); + if (! dentry) { + error = -ENOMEM; + goto cleanup_inode; + } + + filp = dentry_open(dentry, mntget(mnt), O_RDWR); + error = PTR_ERR(filp); + if (IS_ERR(filp)) { + /* a failed dentry_open() has already dput() the dentry and + with it our inode reference, so skip the iput at cleanup_inode */ + goto cleanup_fd; + } + + ra->lastref = inode->u.reiserfs_i.lastref; + ra->expires = inode->u.reiserfs_i.expires; + ra->user1 = inode->u.reiserfs_i.user1; + ra->size = inode->i_size; + + /* lastref update is relatively expensive. Don't do it if not + using PGC. 
*/ + if (sb->u.reiserfs_sb.pgc) { + time_t _pgc_current_time = pgc_current_time (sb); + if (inode->u.reiserfs_i.lastref != _pgc_current_time) { + /* + * Write out stat data and age counters in one transaction + */ + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + journal_begin (&th, sb, jbegin_count) ; + windex = push_journal_writer("rawopen"); + inode->u.reiserfs_i.lastref = _pgc_current_time; + pgc_age_class_update (&th, ra->lastref, inode->u.reiserfs_i.lastref); + reiserfs_update_sd (&th, inode); + pop_journal_writer (windex) ; + journal_end (&th, sb, jbegin_count) ; + } + } + + pgc_process_unlinkq (sb); + fd_install(fd, filp); + return fd; + + cleanup_inode: + iput (inode); + cleanup_fd: + pgc_process_unlinkq (sb); + put_unused_fd (fd); + return error; +} + + +/* Set attributes for raw file. */ + +int +reiserfs_rawsetattr (struct inode * dir, struct reiserfs_raw_ioctl_arg *ra) +{ + struct super_block * sb = dir->i_sb; + struct cpu_key key; + struct inode *inode; + int ret; + + if (! reiserfs_raw (sb)) + return -EPERM; + if ((current->fsuid != dir->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + if (! ra->lastref) + return -EINVAL; /* 0 is special for pgc_age_class_update */ + if (INODE_PKEY(dir)->k_objectid != REISERFS_ROOT_OBJECTID || + INODE_PKEY(dir)->k_dir_id != REISERFS_ROOT_PARENT_OBJECTID) + return -EPERM; + + reiserfs_raw_id_correct (ra); + _make_cpu_key (&key, ITEM_VERSION_2, + le32_to_cpu (ra->id1), + le32_to_cpu (ra->id2), + SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + inode = reiserfs_iget (sb, &key); + if (IS_ERR (inode)) + return PTR_ERR (inode); + + ret = reiserfs_rawsetattr2 (inode, ra); + + iput (inode); + return ret; +} + + +/* Set attributes for open raw file. */ + +int +reiserfs_rawsetattr_f (struct inode * inode, struct reiserfs_raw_ioctl_arg *ra) +{ + struct super_block * sb = inode->i_sb; + + if (! 
reiserfs_raw (sb)) + return -EPERM; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EACCES; + if (! ra->lastref) + return -EINVAL; /* 0 is special for pgc_age_class_update */ + if (! inode->u.reiserfs_i.lastref) /* not a raw file */ + return -EPERM; + + return reiserfs_rawsetattr2 (inode, ra); +} + + +/* Common part of reiserfs_rawsetattr{,_f} */ + +int +reiserfs_rawsetattr2 (struct inode * inode, struct reiserfs_raw_ioctl_arg *ra) +{ + struct super_block * sb = inode->i_sb; + int dirty = 0; + int lastref_dirty = 0; + unsigned old_lastref = 0; + + if (inode->u.reiserfs_i.lastref != ra->lastref) { + old_lastref = inode->u.reiserfs_i.lastref; + inode->u.reiserfs_i.lastref = ra->lastref; + lastref_dirty = 1; + } + if (inode->u.reiserfs_i.expires != ra->expires) { + inode->u.reiserfs_i.expires = ra->expires; + dirty = 1; + } + if (inode->u.reiserfs_i.user1 != ra->user1) { + inode->u.reiserfs_i.user1 = ra->user1; + dirty = 1; + } + + if (lastref_dirty) { + /* + * Write out stat data and age counters in one transaction + */ + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + journal_begin (&th, sb, jbegin_count) ; + windex = push_journal_writer("rawsetattr"); + pgc_age_class_update (&th, old_lastref, inode->u.reiserfs_i.lastref); + reiserfs_update_sd (&th, inode); + pop_journal_writer (windex) ; + journal_end (&th, sb, jbegin_count) ; + } + else if (dirty) { + /* lastref unchanged -- no age class update. Somebody else + will take care of it. */ + mark_inode_dirty(inode); + } + + pgc_process_unlinkq (sb); + return 0; +} + + + +/* Unlink raw file. */ + +int +reiserfs_rawunlink (struct inode * dir, struct reiserfs_raw_ioctl_arg *ra) +{ + struct super_block *sb = dir->i_sb; + struct cpu_key key; + int error; + + if (! 
reiserfs_raw (sb)) + return -EPERM; + if (INODE_PKEY(dir)->k_objectid != REISERFS_ROOT_OBJECTID || + INODE_PKEY(dir)->k_dir_id != REISERFS_ROOT_PARENT_OBJECTID) + return -EPERM; + if ((error = permission (dir, MAY_WRITE))) + return error; + + reiserfs_raw_id_correct (ra); + _make_cpu_key (&key, ITEM_VERSION_2, + le32_to_cpu (ra->id1), + le32_to_cpu (ra->id2), + SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/); + error = reiserfs_rawunlink2 (reiserfs_iget (sb, &key)); + + pgc_process_unlinkq (sb); + return error; +} + + +/* The bottom part of reiserfs_rawunlink, also used by pgc_process_unlinkq */ + +int +reiserfs_rawunlink2 (struct inode * inode) +{ + if (IS_ERR (inode)) + return PTR_ERR (inode); + if (!inode->u.reiserfs_i.lastref) { + /* Not a raw file. Unlinking it the way we do would leave + dangling directory entry, so disallow that */ + iput (inode); + return -EINVAL; + } + if (!inode->i_nlink) { + reiserfs_warning ("sizif-380: iget fetched unlinked file? (from %p)\n", + __builtin_return_address(0)); + goto iput_and_out; /* already deleted */ + } + inode->i_nlink--; + if (inode->i_nlink != 0) { + reiserfs_warning ("sizif-385: raw file with i_nlink=%d?\n", inode->i_nlink+1); + goto iput_and_out; + } + + /* Actually we need more work here. Suppose process P1 has a file + open. Another process (P2) tries to create a file with the + same id, gets EEXIST, and responds as squid/butterfly would by + issuing unlink() to get the file out of the way. This will + result in i_nlink going to zero, but inode will still be in + inode cache, and its stat data on the disk will be intact until + P1 closes the file. So if P2 retries its file creation attempt + right after the unlink(), it will still get EEXIST! This + weirdness is due to the absence of directories. One solution + might be to sleep in create() waiting for file to be removed, + but that's smelling like Windows... 
*/ + + /* + * Write out stat data and age counters in one transaction + */ + { + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + struct super_block *sb = inode->i_sb; + + journal_begin (&th, sb, jbegin_count) ; + windex = push_journal_writer("rawunlink2"); + pgc_age_class_update (&th, inode->u.reiserfs_i.lastref, 0); + reiserfs_update_sd (&th, inode); + pop_journal_writer (windex) ; + journal_end (&th, sb, jbegin_count) ; + } + + iput_and_out: + iput (inode); + return 0; +} + + +/* Our simplified reiserfs_new_inode (). The differences are: + * * we accept pre-generated SD key + * * we only create regular files, so skipping directory/symlink handling + * * we set inode attributes from supplied data + +This needs to be changed to use its own stat data item type with fewer +fields, but this is not a priority. There are hacks in here that will +become cleaner code if we use a new item type instead of engaging in +some pretense of using an older stat data item type. 
-Hans + + */ +static struct inode * +reiserfs_raw_new_inode (struct reiserfs_transaction_handle *th, + struct inode *dir, + struct cpu_key *key, + struct inode *inode, + struct reiserfs_raw_ioctl_arg *ra, + int * err) +{ + struct super_block *sb = dir->i_sb; + INITIALIZE_PATH (path_to_key); + char sdbuf[RAW_SD_SIZE]; + struct stat_data *sd = (void *)sdbuf; + struct item_head ih; + int retval; + + /* We only work with new format */ + if (old_format_only (sb)) { + /* "cannot happen": we should have caught this when mounting + in raw mode */ + reiserfs_warning ("sizif-453: 3.5 superblock?\n"); + *err = -EFAULT; + iput(inode) ; + return NULL; + } + + inode->i_sb = sb; + inode->i_flags = 0; + + /* item head of new item */ + ih.ih_key.k_dir_id = key->on_disk_key.k_dir_id; + ih.ih_key.k_objectid = key->on_disk_key.k_objectid; + make_le_item_head (&ih, 0, ITEM_VERSION_2, SD_OFFSET, TYPE_STAT_DATA, + RAW_SD_SIZE, MAX_US_INT); + + retval = search_item (sb, key, &path_to_key); + if (retval == IO_ERROR) { + iput (inode); + *err = -EIO; + return NULL; + } + if (retval == ITEM_FOUND) { + pathrelse (&path_to_key); + iput (inode); + *err = -EEXIST; + return NULL; + } + + /* fill stat data */ + inode->i_mode = S_IFREG|0600; + inode->i_nlink = 1; + inode->i_uid = current->fsuid; + inode->i_gid = (dir->i_mode & S_ISGID) ? 
dir->i_gid: current->fsgid; + + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_size = 0; + inode->i_blocks = 0; + inode->u.reiserfs_i.i_first_direct_byte = U32_MAX; + inode2sd (sd, inode); + + inode->u.reiserfs_i.lastref = pgc_current_time (sb); + inode->u.reiserfs_i.expires = ra->expires; + inode->u.reiserfs_i.user1 = ra->user1; + ATTR_LASTREF(sd) = cpu_to_le32 (inode->u.reiserfs_i.lastref); + ATTR_EXPIRES(sd) = cpu_to_le32 (ra->expires); + ATTR_USER1(sd) = cpu_to_le32 (ra->user1); + + // these do not go to on-disk stat data + inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); + inode->i_dev = sb->s_dev; + inode->i_blksize = sb->s_blocksize; + memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE); + inode_items_version (inode) = ITEM_VERSION_2; + + /* insert the stat data into the tree */ + retval = reiserfs_insert_item (th, &path_to_key, key, &ih, sdbuf); + if (retval) { + iput (inode); + *err = retval; + reiserfs_check_path(&path_to_key) ; + return NULL; + } + + pgc_age_class_update (th, 0, inode->u.reiserfs_i.lastref); + + /* TODO? inode->i_generation = INODE_PKEY (inode)->k_dir_id; */ + insert_inode_hash (inode); + reiserfs_check_path(&path_to_key) ; + return inode; +} + +/* Remap known bad ids to some other ids. Thus we avoid the "id + cannot be created even if you remove old file with this id" + situation, at the price of increasing collision probability for the + ids that we remap to. */ + +static void +reiserfs_raw_id_correct (struct reiserfs_raw_ioctl_arg *ra) +{ + if (ra->id1 == REISERFS_ROOT_PARENT_OBJECTID && + ra->id2 == REISERFS_ROOT_OBJECTID) { + ra->id1 += 12345; + } +#define SAFESAFE +#ifdef SAFESAFE + if (ra->id1 <= 1) ra->id1 += 2; + if (ra->id2 <= 1) ra->id2 += 2; +#else + /* bravely assuming [0,0], etc. are OK */ +#endif +} + + +/* The following is a good stuff but should REALLY be moved out of + here. It is unrelated to reiserfs. 
--sizif */ + +/* non-portable extension of sigtimedwait to get many siginfo's per + one call. Probably will improve (K)AIO performance. + + Dequeues up to arg->uinfos signals from the set at arg->uthese, + sleeping (only before the first one) for at most *arg->uts, or + without a time limit when arg->uts is NULL. Each siginfo is copied + to arg->uinfo[] when that pointer is set. Returns the number of + signals dequeued, -EAGAIN if the timeout ran out, -EINTR if woken + early without a matching signal, -EINVAL for bad arguments, or + -EFAULT when a user copy fails. +*/ +int +reiserfs_sigtimedwait(struct inode * dir, struct reiserfs_sigtimedwait_arg *arg) +{ + int i; + int ret, sig; + sigset_t these; + struct timespec ts; + siginfo_t info; + long timeout = 0; + + if( arg -> uinfos <= 0 ) + { + return -EINVAL; + } + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if( arg -> sigsetsize != sizeof( sigset_t ) ) + return -EINVAL; + + if( copy_from_user( &these, arg -> uthese, sizeof( these ) ) ) + return -EFAULT; + + /* + * Invert the set of allowed signals to get those we + * want to block. + */ + sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP)); + signotset(&these); + + timeout = MAX_SCHEDULE_TIMEOUT; + if( arg -> uts ) { + if( copy_from_user( &ts, arg -> uts, sizeof( ts ) ) ) + return -EFAULT; + if( ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0 || ts.tv_sec < 0) + return -EINVAL; + } + + ret = 0; + for( i = 0 ; i < arg -> uinfos ; ++i ) + { + spin_lock_irq(&current->sigmask_lock); + sig = dequeue_signal(&these, &info); + /* sleep, but only for the first time */ + if (!sig && !i) + { + /* None ready -- temporarily unblock those we're interested + in so that we'll be awakened when they arrive. 
*/ + /* ts is only filled in when a timeout was supplied; otherwise + keep MAX_SCHEDULE_TIMEOUT set above */ + if( arg -> uts ) + timeout = (timespec_to_jiffies(&ts) + (ts.tv_sec || ts.tv_nsec)); + if( timeout ) + { + sigset_t oldblocked = current->blocked; + sigandsets(&current->blocked, &current->blocked, &these); + recalc_sigpending(current); + spin_unlock_irq(&current->sigmask_lock); + + current->state = TASK_INTERRUPTIBLE; + timeout = schedule_timeout(timeout); + + spin_lock_irq(&current->sigmask_lock); + sig = dequeue_signal(&these, &info); + recalc_sigpending(current); + current->blocked = oldblocked; + } + } + spin_unlock_irq(&current->sigmask_lock); + + if (sig) { + ret = i + 1; + if( arg -> uinfo ) { + if (copy_to_user( &arg -> uinfo[ i ], &info, sizeof info)) + { + ret = -EFAULT; + break; + } + } + } else { + if( i == 0 ) + { + ret = -EAGAIN; + if (timeout) + ret = -EINTR; + } + break; + } + } + return ret; +} diff -rupN linux-kaio/fs/reiserfs/stree.c linux/fs/reiserfs/stree.c --- linux-kaio/fs/reiserfs/stree.c Sat Jul 21 20:20:56 2001 +++ linux/fs/reiserfs/stree.c Sat Aug 25 18:41:48 2001 @@ -734,6 +734,15 @@ int search_by_key (struct super_block * return IO_ERROR; } +#ifdef CONFIG_REISERFS_RAW + /* PGC: glance through the buffer looking for aged files' stat + data. If any found, we'll remove them at once. */ + if (p_s_sb->u.reiserfs_sb.pgc + && B_LEVEL (p_s_bh) == DISK_LEAF_NODE_LEVEL) { + pgc_scan_buffer (p_s_sb, p_s_bh); + } +#endif /* CONFIG_REISERFS_RAW */ + /* It is possible that schedule occurred. We must check whether the key to search is still in the tree rooted from the current buffer. If not then repeat search from the root. 
*/ diff -rupN linux-kaio/fs/reiserfs/super.c linux/fs/reiserfs/super.c --- linux-kaio/fs/reiserfs/super.c Sat Jul 21 20:20:00 2001 +++ linux/fs/reiserfs/super.c Sun Aug 26 00:15:29 2001 @@ -112,7 +112,11 @@ void reiserfs_put_super (struct super_bl s->u.reiserfs_sb.s_rs->s_state = le16_to_cpu (s->u.reiserfs_sb.s_mount_state); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); } - +#ifdef CONFIG_REISERFS_RAW + if (s->u.reiserfs_sb.pgc) + pgc_cleanup (s, (s->s_flags & MS_RDONLY)? NULL: &th); +#endif + /* note, journal_release checks for readonly mount, and can decide not ** to do a journal_end */ @@ -154,13 +158,21 @@ struct super_operations reiserfs_sops = }; +/* return values from parse_options */ +struct mount_options +{ + unsigned long blocks; /* arg of resize= */ + int pgclo; /* 1st arg of pgc=, low threshold */ + int pgchi; /* 2nd arg of pgc=, high threshold */ +}; + /* this was (ext2)parse_options */ -static int parse_options (char * options, unsigned long * mount_options, unsigned long * blocks) +static int parse_options (char * options, unsigned long * mount_options, struct mount_options * mop) { char * this_char; char * value; - *blocks = 0; + memset (mop, 0, sizeof (*mop)); if (!options) /* use default configuration: create tails, journaling on, no conversion to newest format */ @@ -193,7 +205,7 @@ static int parse_options (char * options set_bit (REPLAYONLY, mount_options); } else if (!strcmp (this_char, "resize")) { if (value && *value){ - *blocks = simple_strtoul (value, &value, 0); + mop->blocks = simple_strtoul (value, &value, 0); } else { printk("reiserfs: resize option requires a value\n"); return 0; @@ -222,6 +234,29 @@ static int parse_options (char * options printk("reiserfs: hash option requires a value\n"); return 0 ; } +#ifdef CONFIG_REISERFS_RAW + } else if (!strcmp (this_char, "raw")) { + set_bit (REISERFS_RAW, mount_options); + } else if (!strcmp (this_char, "pgc")) { + if (! 
value) { + pgc_usage: + printk("reiserfs: Usage: pgc=LOW,HIGH\n"); + return 0; + } + mop->pgclo = simple_strtoul (value, &value, 10); + if (*value != '\0') + goto pgc_usage; + value = strtok (NULL, ","); + if (!value) + goto pgc_usage; + mop->pgchi = simple_strtoul (value, &value, 10); + if (mop->pgclo <= 0 || mop->pgchi < mop->pgclo || mop->pgchi > 100) { + printk("reiserfs: pgc: bad parameters\n"); + return 0; + } + if (*value != '\0') + goto pgc_usage; +#endif /* CONFIG_REISERFS_RAW */ } else { printk ("reiserfs: Unrecognized mount option %s\n", this_char); return 0; @@ -247,20 +282,24 @@ int reiserfs_remount (struct super_block { struct reiserfs_super_block * rs; struct reiserfs_transaction_handle th ; - unsigned long blocks; + struct mount_options mo; unsigned long mount_options; rs = SB_DISK_SUPER_BLOCK (s); - if (!parse_options(data, &mount_options, &blocks)) + if (!parse_options(data, &mount_options, &mo)) return 0; - if(blocks) { - int rc = reiserfs_resize(s, blocks); + if(mo.blocks) { + int rc = reiserfs_resize(s, mo.blocks); if (rc != 0) return rc; } - +#if defined( CONFIG_REISERFS_RAW ) + if (mo.pgclo) + pgc_set_thresholds (s, mo.pgclo, mo.pgchi); +#endif + if ((unsigned long)(*flags & MS_RDONLY) == (s->s_flags & MS_RDONLY)) { /* there is nothing to do to remount read-only fs as read-only fs */ return 0; @@ -687,18 +726,18 @@ struct super_block * reiserfs_read_super extern int *blksize_size[]; struct reiserfs_transaction_handle th ; int old_format = 0; - unsigned long blocks; + struct mount_options mo; int jinit_done = 0 ; struct reiserfs_iget4_args args ; memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info)); - if (parse_options ((char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) { + if (parse_options ((char *) data, &(s->u.reiserfs_sb.s_mount_opt), &mo) == 0) { return NULL; } - if (blocks) { + if (mo.blocks) { printk("reserfs: resize option for remount only\n"); return NULL; } @@ -768,7 +807,11 @@ struct super_block * 
reiserfs_read_super iput(root_inode); goto error; } - + if (reiserfs_raw(s) && old_format_only(s)) { + printk ("-o raw cannot be used with reiserfs < 3.6\n"); + goto error; + } + // define and initialize hash function s->u.reiserfs_sb.s_hash_function = hash_function (s); if (s->u.reiserfs_sb.s_hash_function == NULL) { @@ -821,6 +864,14 @@ struct super_block * reiserfs_read_super init_waitqueue_head (&(s->u.reiserfs_sb.s_wait)); +#ifdef CONFIG_REISERFS_RAW + if (mo.pgclo) { + if (pgc_init (s) < 0) + goto error; + pgc_set_thresholds (s, mo.pgclo, mo.pgchi); + } +#endif + printk("%s\n", reiserfs_get_version_string()) ; return s; diff -rupN linux-kaio/include/linux/age_class.h linux/include/linux/age_class.h --- linux-kaio/include/linux/age_class.h Thu Jan 1 03:00:00 1970 +++ linux/include/linux/age_class.h Sun Aug 26 01:06:23 2001 @@ -0,0 +1,69 @@ +/* + * include/linux/age_class.h + * + * Class LRU routines for file aging and replacement. + * + * Written by Yury Shevchuk + * + * Copyright 2000 Hans Reiser, see reiserfs/README for licensing and copyright details + * + * $Id: age_class.h,v 1.1.2.5 2000/10/08 11:49:02 lim Exp $ + */ + +#ifndef __AGE_CLASS_H__ +#define __AGE_CLASS_H__ + +/* + * + * Class LRU routines + * + * + * Class LRU control structure is a small array of counters. Each + * counter counts files whose timestamp falls within the time interval + * associated with this counter: + * + * time interval file count + * Age class 0: curtime...t[0] c[0] + * Age class 1: t[0]...t[1] c[1] + * Age class 2: t[1]...t[2] c[2] + * Age class 3: t[2]...t[3] c[3] + * Age class 4: t[3]...t[4] c[4] + * ... + * + * In normal operation, c[0] is incremented with every file + * create/open. When the counter grows above 1/AGE_CLASS_COUNT of the + * total number of files, age_class_shift shifts age class boundaries + * leaving class 0 with c[0] == 0 and t[0] == curtime. The shift + * procedure tends to maintain more or less equal size classes. 
+ * + * This compact counter structure provides a replacement for expensive + * true LRU list. By knowing the age distribution, we can compute + * reasonable age threshold and then scan the store (either actively or + * passively) removing all files older than that. */ + +#include + +#define AGE_CLASS_COUNT 15 /* was 32; lowered to make it fit + in zeroed s_unused in reiserfs + superblock */ + +typedef +struct age_class { + unsigned total; /* sum of class[*].c: total number of files in + this fs */ + struct { + unsigned c; /* count of files in this class */ + time_t t; /* low time boundary: files with timestamp>=t + but less than class[-1].t belong to this class */ + } class[AGE_CLASS_COUNT]; +} age_class; + +void age_class_init (age_class *a, time_t curtime); +void age_class_update (age_class *a, time_t time, int inc); +int age_class_find (age_class *a, time_t time); +int age_class_shift (age_class *a, time_t curtime); +char * age_class_dump (age_class *a, time_t curtime, char *buf, int bufsz); +time_t age_class_threshold (age_class *a, unsigned ratio, time_t curtime); + + +#endif /* __AGE_CLASS_H__ */ diff -rupN linux-kaio/include/linux/aio.h linux/include/linux/aio.h --- linux-kaio/include/linux/aio.h Sun Aug 26 00:09:14 2001 +++ linux/include/linux/aio.h Sun Aug 26 01:10:10 2001 @@ -35,11 +35,14 @@ #define AIO_THREAD_ENVIRON "AIO_NTHREADS" #define AIO_DEFAULT_THREADS 4 -#define AIO_MAX_THREADS 64 +/* #define AIO_MAX_THREADS 64 */ +#define AIO_MAX_THREADS (1024-100) #define AIO_TIMES 8 #define AIO_RESERVED 4 +#define KAIO_STATS + /* * User code can turn on 64-bit offsets in aiocb by: * #define _FILE_OFFSET_BITS 64 @@ -110,13 +113,24 @@ typedef struct aiocb64 { #define AIOCMD_WRITE 7 #define AIOCMD_GENERIC_READ 8 /* Reserved for internal use */ #define AIOCMD_LIST_IO 9 +#define AIOCMD_IOCTL 10 +#define AIOCMD_CLOSE 11 /* * For gathering timing statistics. 
*/ +/* #define AIOTIME_REQUEST 0 #define AIOTIME_IO 1 #define AIOTIME_COPY 2 +*/ +#define AIOTIME_REQUEST 0 /* (userspace) before aio_* */ +#define AIOTIME_ENQUEUED 1 /* kaiocb created */ +#define AIOTIME_DEQUEUED 2 /* request dequeued by a slave */ +#define AIOTIME_IO 3 /* i/o finished */ +#define AIOTIME_COPY 4 /* copying to userspace finshed */ +#define AIOTIME_COMPLETE 6 /* in kaio_io_complete */ +#define AIOTIME_FINISH 7 /* (userspace) noticed op completion */ #if !defined(__USE_FILE_OFFSET64) && !defined(AIO_FILE_OFFSET64) /* @@ -131,6 +145,8 @@ int aio_suspend(const struct aiocb *cons int nent, const struct timespec *timeout); int lio_listio(int mode, struct aiocb *const list[], int nent, struct sigevent *sig); +int aio_ioctl(struct aiocb *aiocbp); +int aio_close(struct aiocb *aiocbp); #else # ifdef __REDIRECT extern int __REDIRECT (aio_read, __P ((struct aiocb *__aiocbp)), aio_read64); @@ -171,6 +187,8 @@ int aio_suspend64(const struct aiocb64 * int nent, const struct timespec *timeout); int lio_listio64(int mode, struct aiocb64 *const list[], int nent, struct sigevent *sig); +int aio_ioctl64(struct aiocb64 *aiocbp); +int aio_close64(struct aiocb64 *aiocbp); #endif /* @@ -256,6 +274,7 @@ typedef struct kaiocb { void *kaio_scp; /* cookie returned by raw I/O */ #endif + int kaio_fd; /* original fd, for AIOCMD_ */ #ifdef KAIO_STATS unsigned long kaio_times[AIO_TIMES]; @@ -281,6 +300,7 @@ typedef struct kaio_key { #define aio_return aio_reserved[3] extern int kaio_rw(int cmd, struct aiocb *, liocb_t *); +extern int kaio_syscall(int cmd, struct aiocb *); extern int kaio_list_io(int, struct aiocb *[], int, sigevent_t *); extern int kaio_suspend(aiocb_t *[], int, struct timespec *, int); extern int kaio_cancel(int, kaio_key_t *); diff -rupN linux-kaio/include/linux/reiserfs_fs.h linux/include/linux/reiserfs_fs.h --- linux-kaio/include/linux/reiserfs_fs.h Fri Aug 24 01:36:23 2001 +++ linux/include/linux/reiserfs_fs.h Sun Aug 26 01:20:40 2001 @@ -118,6 +118,7 @@ 
#define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" #define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" +#ifdef __KERNEL__ static inline int is_reiserfs_magic_string (struct reiserfs_super_block * rs) { return (!strncmp (rs->s_magic, REISERFS_SUPER_MAGIC_STRING, @@ -125,6 +126,7 @@ static inline int is_reiserfs_magic_stri !strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, strlen ( REISER2FS_SUPER_MAGIC_STRING))); } +#endif /* __KERNEL__ */ /* ReiserFS leaves the first 64k unused, so that partition labels have enough @@ -466,7 +468,7 @@ static inline __u32 type2uniqueness (int return V1_ANY_UNIQUENESS; } - +#ifdef __KERNEL__ // // key is pointer to on disk key which is stored in le, result is cpu, // there is no way to get version of object from key, so, provide @@ -530,7 +532,7 @@ static inline void set_le_ih_k_type (str #define is_indirect_le_ih(ih) is_indirect_le_key (ih_version(ih), &((ih)->ih_key)) #define is_statdata_le_ih(ih) is_statdata_le_key (ih_version (ih), &((ih)->ih_key)) - +#endif /* __KERNEL__ */ // // key is pointer to cpu key, result is cpu @@ -722,6 +724,8 @@ struct stat_data { based on sd_size and our tail suppression policy? */ } __attribute__ ((__packed__)) u; + int more[0]; /* reference point for extensions + like raw attributes below */ } __attribute__ ((__packed__)); // // this is 40 bytes long @@ -730,6 +734,25 @@ struct stat_data { #define stat_data_v1(ih) (ih_version (ih) == ITEM_VERSION_1) +/* + * Reiserfs-raw keeps its extra attributes in stat data. There is no + * place for them in struct stat_data, and I don't want to add + * additional fields to the disk structure -- normal files don't need + * these extra attributes, so we use the following trick. For raw + * files, we create an enlarged SD item, and keep our attribute values + * just after stat data structure. Here are macros to access them. + * Remember they are in le32 layout. 
+ */ +struct raw_attr { + __u32 lastref; + __u32 expires; + __u32 user1; +}; +#define ATTR_LASTREF(SD) (((struct raw_attr *)(SD)->more)->lastref) +#define ATTR_EXPIRES(SD) (((struct raw_attr *)(SD)->more)->expires) +#define ATTR_USER1(SD) (((struct raw_attr *)(SD)->more)->user1) +#define RAW_SD_SIZE (sizeof(struct stat_data) + \ + sizeof(struct raw_attr)) /***************************************************************************/ /* DIRECTORY STRUCTURE */ @@ -826,6 +849,7 @@ struct reiserfs_de_head #define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) #define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#ifdef __KERNEL__ /* compose directory item containing "." and ".." entries (entries are not aligned to 4 byte boundary) */ static inline void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid, @@ -932,7 +956,7 @@ static inline int entry_length (struct b // two entries per block (at least) #define REISERFS_MAX_NAME_LEN(block_size) 255 - +#endif /* __KERNEL__ */ /* this structure is used for operations on directory entries. It is @@ -1135,6 +1159,7 @@ struct path var = {ILLEGAL_PATH_ELEMENT_ ///#define TOO_LONG_LENGTH (~0ULL) +#ifdef __KERNEL__ // reiserfs version 2 has max offset 60 bits. 
Version 1 - 32 bit offset #define U32_MAX (~(__u32)0) static inline loff_t max_reiserfs_offset (struct inode * inode) @@ -1144,7 +1169,7 @@ static inline loff_t max_reiserfs_offset return (loff_t)((~(__u64)0) >> 4); } - +#endif /* __KERNEL__ */ /*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/ #define MAX_KEY_OBJECTID MAX_UL_INT @@ -1397,6 +1422,7 @@ struct buffer_info { +-------------------+------------+--------------+------------+ */ +#ifdef __KERNEL__ struct item_operations { int (*bytes_number) (struct item_head * ih, int block_size); void (*decrement_key) (struct cpu_key *); @@ -1430,7 +1456,7 @@ extern struct item_operations * item_ops #define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi) #define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi) - +#endif /* __KERNEL__ */ @@ -1544,9 +1570,10 @@ struct reiserfs_journal_header { __u32 j_first_unflushed_offset ; /* offset in the log of where to start replay after a crash */ __u32 j_mount_id ; } ; - +#ifdef __KERNEL__ extern task_queue reiserfs_commit_thread_tq ; extern wait_queue_head_t reiserfs_commit_thread_wait ; +#endif /* __KERNEL__ */ /* biggest tunable defines are right here */ #define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ @@ -1580,6 +1607,7 @@ extern wait_queue_head_t reiserfs_commit */ #define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT]) +#ifdef __KERNEL__ void reiserfs_wait_on_write_block(struct super_block *s) ; void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ; void reiserfs_allow_writes(struct super_block *s) ; @@ -1773,12 +1801,15 @@ int reiserfs_prepare_write(struct file * void reiserfs_truncate_file(struct inode *, int update_timestamps) ; void make_cpu_key (struct cpu_key * cpu_key, const struct inode * inode, loff_t offset, int type, int key_length); +void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, + loff_t offset, int type, int length); void make_le_item_head (struct 
item_head * ih, struct cpu_key * key, int version, loff_t offset, int type, int length, int entry_count); /*void store_key (struct key * key); void forget_key (struct key * key);*/ int reiserfs_get_block (struct inode * inode, long block, struct buffer_head * bh_result, int create); +void inode2sd (void * sd, struct inode * inode); struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key); void reiserfs_read_inode (struct inode * inode) ; void reiserfs_read_inode2(struct inode * inode, void *p) ; @@ -1944,6 +1975,47 @@ __u32 r5_hash (const char *msg, int len) /* version.c */ char *reiserfs_get_version_string(void) ; +#endif /* __KERNEL__ */ + +/* raw.c */ +struct reiserfs_raw_ioctl_arg { + __u32 id1; /* object id of the obj to operate on (low part) */ + __u32 id2; /* object id of the obj to operate on (high part) */ + __u32 lastref; /* last reference date (a la atime) */ + __u32 expires; /* expiration date */ + __u32 user1; /* user attribute, not used by the kernel */ + loff_t size; /* file size returned by RAWOPEN */ +}; + +struct reiserfs_sigtimedwait_arg { + sigset_t *uthese; + size_t sigsetsize; + siginfo_t *uinfo; + size_t uinfos; + struct timespec *uts; +}; + +#ifdef __KERNEL__ +int reiserfs_rawopen (struct inode *, struct vfsmount *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawcreat (struct inode *, struct vfsmount *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawsetattr (struct inode *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawsetattr_f (struct inode *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawsetattr2 (struct inode *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawunlink (struct inode *, struct reiserfs_raw_ioctl_arg *); +int reiserfs_rawunlink2 (struct inode *); +int reiserfs_sigtimedwait(struct inode *, struct reiserfs_sigtimedwait_arg *); + +/* pgc.c */ +int pgc_init (struct super_block *sb); +void pgc_cleanup (struct super_block *, struct reiserfs_transaction_handle *); +void pgc_set_thresholds 
(struct super_block *, int, int); +void pgc_scan_buffer (struct super_block *sb, struct buffer_head *bh); +void pgc_age_class_update (struct reiserfs_transaction_handle *th, + time_t old_lastref, time_t new_lastref); +void pgc_process_unlinkq (struct super_block *sb); +time_t pgc_current_time (struct super_block *sb); + +#endif /*__KERNEL__*/ /* the ext2 bit routines adjust for big or little endian as ** appropriate for the arch, so in our laziness we use them rather @@ -2039,6 +2111,7 @@ found_middle: absolutely safe */ #define SPARE_SPACE 500 +#ifdef __KERNEL__ static inline unsigned long reiserfs_get_journal_block(struct super_block *s) { return le32_to_cpu(SB_DISK_SUPER_BLOCK(s)->s_journal_block) ; } @@ -2050,10 +2123,18 @@ static inline unsigned long reiserfs_get int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned long arg); int reiserfs_unpack (struct inode * inode, struct file * filp); - +int reiserfs_dir_ioctl (struct inode * inode, struct file * filp, + unsigned int cmd, unsigned long arg); +#endif /* __KERNEL__ */ + /* ioctl's command */ #define REISERFS_IOC_UNPACK _IOW(0xCD,1,long) - +#define REISERFS_IOC_RAWOPEN _IOWR(0xCD,15,sizeof(struct reiserfs_raw_ioctl_arg)) +#define REISERFS_IOC_RAWCREAT _IOW(0xCD,16,sizeof(struct reiserfs_raw_ioctl_arg)) +#define REISERFS_IOC_RAWSETATTR _IOW(0xCD,17,sizeof(struct reiserfs_raw_ioctl_arg)) +#define REISERFS_IOC_RAWUNLINK _IOW(0xCD,18,sizeof(struct reiserfs_raw_ioctl_arg)) +#define REISERFS_IOC_TIMEDWAIT _IOW(0xCD,19,sizeof(struct reiserfs_sigtimedwait_arg)) + #endif /* _LINUX_REISER_FS_H */ diff -rupN linux-kaio/include/linux/reiserfs_fs_i.h linux/include/linux/reiserfs_fs_i.h --- linux-kaio/include/linux/reiserfs_fs_i.h Sat Jul 21 20:20:58 2001 +++ linux/include/linux/reiserfs_fs_i.h Sat Aug 25 17:39:03 2001 @@ -40,6 +40,35 @@ struct reiserfs_inode_info { is a comment you should make.... 
-Hans */ //nopack-attribute int nopack; + + /* reiserfs_raw attributes */ + __u32 lastref; /* the time the file was last + referenced (created or open). This + is used for garbage collection, so + we don't need precise timestamping + -- all timestamps that fit in one + age class can be considered the + same. So we use coarse-grained + timestamp provided by + pgc_current_time() to avoid + excessive timestamp updates when + the file is accessed frequently. In + other words, we avoid updating + timestamps unless the update would + result in moving the file into + another age class. */ + __u32 expires; /* expiration date -- may be set by + application to request removal of + the file by PGC even if the lastref + isn't old enough to indicate it + should be targeted for removal + lastref-wise */ + __u32 user1; /* an opaque attribute for application + use: the kernel only stores and + retrieves it, but never looks at + its value. In the case of squid it + may someday hold the length of the + header. */ }; diff -rupN linux-kaio/include/linux/reiserfs_fs_sb.h linux/include/linux/reiserfs_fs_sb.h --- linux-kaio/include/linux/reiserfs_fs_sb.h Fri Aug 24 00:23:40 2001 +++ linux/include/linux/reiserfs_fs_sb.h Sun Aug 26 01:06:26 2001 @@ -6,6 +6,7 @@ #ifdef __KERNEL__ #include +#include #endif // @@ -302,6 +303,9 @@ struct reiserfs_sb_info int s_bmaps_without_search; int s_direct2indirect; int s_indirect2direct; +#ifdef CONFIG_REISERFS_RAW + struct pgc *pgc; +#endif }; @@ -327,6 +331,13 @@ struct reiserfs_sb_info #define FORCE_R5_HASH 8 /* try to force rupasov hash on mount */ #define FORCE_HASH_DETECT 9 /* try to detect hash function on mount */ +/* -o raw: enable ioctl (REISERFS_IOC_RAW*) which provides raw access + to internal reiserfs tree. This is designed for squid proxy cache + acceleration. To use this, recompile the kernel with + CONFIG_REISERFS_RAW enabled. 
*/ + +#define REISERFS_RAW 10 + /* used for testing experimental features, makes benchmarking new features with and without more convenient, should never be used by @@ -355,7 +366,7 @@ struct reiserfs_sb_info #define replay_only(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REPLAYONLY)) #define reiserfs_dont_log(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NOLOG)) #define old_format_only(s) ((SB_VERSION(s) != REISERFS_VERSION_2) && !((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_CONVERT))) - +#define reiserfs_raw(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_RAW)) void reiserfs_file_buffer (struct buffer_head * bh, int list); int reiserfs_is_super(struct super_block *s) ; diff -rupN linux-kaio/mm/filemap.c linux/mm/filemap.c --- linux-kaio/mm/filemap.c Sun Aug 26 00:09:14 2001 +++ linux/mm/filemap.c Sun Aug 26 23:03:03 2001 @@ -1272,6 +1272,15 @@ typedef struct { #define KAIO_INFO(task) ((kaio_queue_t *) (task)->kaio_info) +static inline unsigned long +kaio_time(void) +{ + struct timeval tv; + + do_gettimeofday(&tv); + return (tv.tv_sec * 1000000 + tv.tv_usec); +} + /* * AIO HASH TABLE: * A simple system wide table hashed on kq & aio_id. 
@@ -1421,7 +1430,6 @@ kaio_enqueue(kaio_queue_t *kq, kaiocb_t if (put_user(kaiocb->kaio_error, &kaiocb->kaio_uaiocb->aio_error)) return(-EFAULT); kaiocb->kaio_suspend_wait = 0; - kaio_set_time(kaiocb, AIOTIME_REQUEST); INIT_LIST_HEAD(&kaiocb->kaio_ioq); spin_lock(&kq->kaioq_lock); @@ -1435,7 +1443,9 @@ kaio_enqueue(kaio_queue_t *kq, kaiocb_t up(&kq->kaiosem); spin_unlock(&kq->kaioq_lock); - return 0; + kaio_set_time(kaiocb, AIOTIME_ENQUEUED); + + return 0; } /* @@ -1515,27 +1525,28 @@ kaio_init(void) kaio_init_hash_table(); } -static inline unsigned long -kaio_time(void) -{ - struct timeval tv; - - do_gettimeofday(&tv); - return (tv.tv_sec * 1000000 + tv.tv_usec); -} - static void send_signal(sigevent_t *sigev, struct task_struct *task) { if (sigev->sigev_notify == SIGEV_SIGNAL) { struct siginfo sinfo; + int r; memset(&sinfo, 0, sizeof(sinfo)); sinfo.si_signo = sigev->sigev_signo; sinfo.si_code = SI_ASYNCIO; sinfo.si_value = sigev->sigev_value; - send_sig_info(sigev->sigev_signo, &sinfo, task); + /* signal queue items are allocated at GFP_ATOMIC, so + -EAGAIN is possible. We don't want to lose + completion signals, really, so don't leave before + we succeed queuing the signal. */ + retry: + r = send_sig_info(sigev->sigev_signo, &sinfo, task); + if (r == -EAGAIN) { + schedule (); + goto retry; + } } } @@ -1578,8 +1589,9 @@ kaio_io_complete(kaiocb_t *kaiocb, int n } #ifdef KAIO_STATS - if (copy_to_user(&uaiocb->aio_times, &kaiocb->kaio_times, - sizeof(unsigned long) * AIO_TIMES)) + kaio_set_time(kaiocb, AIOTIME_COMPLETE); + if (copy_to_user(&uaiocb->aio_times[1], &kaiocb->kaio_times[1], + sizeof(unsigned long) * (AIO_TIMES-1))) { ret = -EFAULT; } else @@ -2016,9 +2028,10 @@ kaio_exit(void) int ret; \ (k)->kaio_filp->f_pos = (k)->kaio_offset; \ ret = (k)->kaio_filp->f_op-> ## op ((k)->kaio_filp, \ - (k)->kaio_buf, (k)->kaio_nbytes, &(k)->kaio_offset); \ + (k)->kaio_buf, (k)->kaio_nbytes, &(k)->kaio_filp->f_pos); \ kaio_io_complete((k), ret < 0 ? 0 : ret, ret > 0 ? 
0 : ret, 0); \ } +/* (k)->kaio_buf, (k)->kaio_nbytes, &(k)->kaio_offset); \ */ static void kaio_set_slave_name(void) @@ -2030,6 +2043,30 @@ kaio_set_slave_name(void) current->mm->arg_start = current->mm->arg_end = 0; } +/* Called from kaio_slave to do AIOCMD_ requests */ +asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg); +static void +kaio_do_syscall(kaiocb_t *kaiocb) +{ + int ret; + + switch (kaiocb->kaio_cmd) { + case AIOCMD_IOCTL: + ret = sys_ioctl(kaiocb->kaio_fd, kaiocb->kaio_nbytes, + (unsigned long) kaiocb->kaio_buf); + break; + case AIOCMD_CLOSE: + ret = sys_close(kaiocb->kaio_fd); + break; + default: + ret = -EINVAL; + break; + } + + kaio_io_complete(kaiocb, ret<0? 0:ret, ret>0? 0:ret, 0); + return; +} + /* * kaio_slave: * Entry point for slave threads that service queued I/O requests. @@ -2058,6 +2095,10 @@ kaio_slave() kaio_set_slave_name(); kq = KAIO_INFO(current->p_pptr); /* service parent's kaio queue */ + if (! kq) { + unlock_kernel(); + return; + } KAIO_INFO(current) = 0; /* so kaio_exit does not trip */ kq->kaioqref--; if (kq->kaioq_nslaves == -1) @@ -2068,6 +2109,7 @@ kaio_slave() drain_io: while ((kaiocb = kaio_dequeue(kq))) { /* blocks */ + kaio_set_time(kaiocb, AIOTIME_DEQUEUED); #if defined(CONFIG_RAW) if (kaio_is_raw(kaiocb->kaio_filp)) do_raw_wait(kaiocb); @@ -2081,11 +2123,34 @@ drain_io: case AIOCMD_READ: KAIO_DO_FOP(read, kaiocb); break; - case AIOCMD_WRITE: { - KAIO_DO_FOP(write, kaiocb); + case AIOCMD_WRITE: { + KAIO_DO_FOP(write, kaiocb); break; - } - default: + } + /* + case AIOCMD_WRITE: { + /* TODO: for O_APPEND files, write + requests must be done in the submission + order. Howto? --sizif * / + loff_t pos = kaiocb->kaio_offset; + int ret = kaiocb->kaio_filp->f_op->write + (kaiocb->kaio_filp, + kaiocb->kaio_buf, + kaiocb->kaio_nbytes, + &pos); + kaiocb->kaio_filp->f_pos = pos; + kaio_set_time(kaiocb, AIOTIME_IO); + kaio_io_complete(kaiocb, + ret<0? 0:ret, + ret>0? 
0:ret, 0); + break; + } + */ + case AIOCMD_IOCTL: + case AIOCMD_CLOSE: + kaio_do_syscall(kaiocb); + break; + default: kaio_io_complete(kaiocb, 0, -EINVAL, 0); } } @@ -2169,7 +2234,7 @@ retry: */ if (!page_cache) { spin_unlock(&pagecache_lock); - page_cache = page_cache_alloc(); + page_cache = page_cache_alloc(mapping); /* * That could have slept, so go around to the * very beginning.. @@ -2321,7 +2386,10 @@ kaio_rw(int cmd, aiocb_t *uaiocb, liocb_ kaiocb->kaio_cmd = cmd; kaiocb->kaio_liocb = liocb; kaiocb->kaio_pages = 0; - if (cmd == AIOCMD_READ && +#ifdef KAIO_STATS + memset(kaiocb->kaio_times, 0, sizeof (kaiocb->kaio_times)); +#endif + if (cmd == AIOCMD_READ && !access_ok(VERIFY_WRITE, aiocb.aio_buf, aiocb.aio_nbytes)) { ret = -EFAULT; @@ -2365,6 +2433,60 @@ rw_error: return ret; } +int +kaio_syscall(int cmd, aiocb_t *uaiocb) +{ + struct file *filp = NULL; + aiocb_t aiocb; + int ret = 0; + kaiocb_t *kaiocb = NULL; + + + if (copy_from_user(&aiocb, uaiocb, sizeof(aiocb_t))) + return -EFAULT; + if (!(filp = fget(aiocb.aio_fildes))) + return -EBADF; + + kaiocb = kmalloc(sizeof(kaiocb_t), GFP_KERNEL); + if (!kaiocb) { + ret = -EAGAIN; + goto error; + } + kaiocb->kaio_fd = aiocb.aio_fildes; + kaiocb->kaio_filp = filp; + kaiocb->kaio_buf = aiocb.aio_buf; + kaiocb->kaio_nbytes = aiocb.aio_nbytes; + kaiocb->kaio_offset = aiocb.aio_offset; + kaiocb->kaio_sigevent = aiocb.aio_sigevent; + kaiocb->kaio_task = current; + kaiocb->kaio_kq = KAIO_INFO(current); + kaiocb->kaio_uaiocb = uaiocb; + kaiocb->kaio_cmd = cmd; + kaiocb->kaio_liocb = NULL; + kaiocb->kaio_pages = 0; +#ifdef KAIO_STATS + memset(kaiocb->kaio_times, 0, sizeof (kaiocb->kaio_times)); +#endif + + ret = kaio_enqueue(KAIO_INFO(current), kaiocb); + if (!ret) { + kaio_key_t kkey; + + kkey.kaio_kid = kaiocb->kaio_id; + kkey.kaio_kq = current->kaio_info; + + if (copy_to_user(&uaiocb->aio_key, &kkey, sizeof(kkey))) + ret = -EFAULT; + } + error: + if (ret) { + if (filp) + fput(filp); + if (kaiocb) + kfree(kaiocb); + } + 
return ret; +} static void kaio_set_list_errors(aiocb_t *list[], int nent, int error)