diff -Nurd linux-2.6.24/Documentation/ABI/stable/sysfs-class-ubi ubifs-v2.6.24/Documentation/ABI/stable/sysfs-class-ubi --- linux-2.6.24/Documentation/ABI/stable/sysfs-class-ubi 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/Documentation/ABI/stable/sysfs-class-ubi 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,212 @@ +What: /sys/class/ubi/ +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + The ubi/ class sub-directory belongs to the UBI subsystem and + provides general UBI information, per-UBI device information + and per-UBI volume information. + +What: /sys/class/ubi/version +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + This file contains the version of the latest supported UBI on-media + format. Currently it is 1, and there is no plan to change this. + However, if in the future UBI needs on-flash format changes + which cannot be done in a compatible manner, a new format + version will be added. So this is a mechanism for possible + future backward-compatible (but forward-incompatible) + improvements. + +What: /sys/class/ubi/ubiX/ +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + The /sys/class/ubi/ubi0, /sys/class/ubi/ubi1, etc. directories describe + UBI devices (UBI device 0, 1, etc). They contain general UBI + device information and per UBI volume information (each UBI + device may have many UBI volumes). + +What: /sys/class/ubi/ubiX/avail_eraseblocks +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Amount of available logical eraseblocks. For example, one may + create a new UBI volume which has this amount of logical + eraseblocks. + +What: /sys/class/ubi/ubiX/bad_peb_count +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Count of bad physical eraseblocks on the underlying MTD device. 
+ +What: /sys/class/ubi/ubiX/bgt_enabled +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Contains ASCII "0\n" if the UBI background thread is disabled, + and ASCII "1\n" if it is enabled. + +What: /sys/class/ubi/ubiX/dev +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Major and minor numbers of the character device corresponding + to this UBI device (in <major>:<minor> format). + +What: /sys/class/ubi/ubiX/eraseblock_size +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Maximum logical eraseblock size this UBI device may provide. UBI + volumes may have smaller logical eraseblock size because of their + alignment. + +What: /sys/class/ubi/ubiX/max_ec +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Maximum physical eraseblock erase counter value. + +What: /sys/class/ubi/ubiX/max_vol_count +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Maximum number of volumes which this UBI device may have. + +What: /sys/class/ubi/ubiX/min_io_size +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Minimum input/output unit size. All the I/O may only be done + in multiples of the contained number. + +What: /sys/class/ubi/ubiX/mtd_num +Date: January 2008 +KernelVersion: 2.6.25 +Contact: Artem Bityutskiy +Description: + Number of the underlying MTD device. + +What: /sys/class/ubi/ubiX/reserved_for_bad +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Number of physical eraseblocks reserved for bad block handling. + +What: /sys/class/ubi/ubiX/total_eraseblocks +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Total number of good (not marked as bad) physical eraseblocks on + the underlying MTD device. 
+ +What: /sys/class/ubi/ubiX/volumes_count +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Count of volumes on this UBI device. + +What: /sys/class/ubi/ubiX/ubiX_Y/ +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + The /sys/class/ubi/ubiX/ubiX_0/, /sys/class/ubi/ubiX/ubiX_1/, + etc directories describe UBI volumes on UBI device X (volumes + 0, 1, etc). + +What: /sys/class/ubi/ubiX/ubiX_Y/alignment +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Volume alignment - the value the logical eraseblock size of + this volume has to be aligned on. For example, 2048 means that + logical eraseblock size is a multiple of 2048. In other words, + volume logical eraseblock size is UBI device logical eraseblock + size aligned to the alignment value. + +What: /sys/class/ubi/ubiX/ubiX_Y/corrupted +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Contains ASCII "0\n" if the UBI volume is OK, and ASCII "1\n" + if it is corrupted (e.g., due to an interrupted volume update). + +What: /sys/class/ubi/ubiX/ubiX_Y/data_bytes +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + The amount of data this volume contains. This value makes sense + only for static volumes, and for dynamic volumes it is equivalent + to the total volume size in bytes. + +What: /sys/class/ubi/ubiX/ubiX_Y/dev +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Major and minor numbers of the character device corresponding + to this UBI volume (in <major>:<minor> format). + +What: /sys/class/ubi/ubiX/ubiX_Y/name +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Volume name. + +What: /sys/class/ubi/ubiX/ubiX_Y/reserved_ebs +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Count of physical eraseblocks reserved for this volume. + Equivalent to the volume size in logical eraseblocks. 
+ +What: /sys/class/ubi/ubiX/ubiX_Y/type +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Volume type. Contains ASCII "dynamic\n" for dynamic volumes and + "static\n" for static volumes. + +What: /sys/class/ubi/ubiX/ubiX_Y/upd_marker +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Contains ASCII "0\n" if the update marker is not set for this + volume, and "1\n" if it is set. The update marker is set when + volume update starts, and cleared when it ends. So the presence + of the update marker indicates that the volume is either being updated + at the moment or the update was interrupted. The latter may be + checked using the "corrupted" sysfs file. + +What: /sys/class/ubi/ubiX/ubiX_Y/usable_eb_size +Date: July 2006 +KernelVersion: 2.6.22 +Contact: Artem Bityutskiy +Description: + Logical eraseblock size of this volume. Equivalent to logical + eraseblock size of the device aligned on the volume alignment + value. diff -Nurd linux-2.6.24/Documentation/filesystems/ubifs.txt ubifs-v2.6.24/Documentation/filesystems/ubifs.txt --- linux-2.6.24/Documentation/filesystems/ubifs.txt 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/Documentation/filesystems/ubifs.txt 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,169 @@ +Introduction +============= + +UBIFS file-system stands for UBI File System. UBI stands for "Unsorted +Block Images". UBIFS is a flash file system, which means it is designed +to work with flash devices. It is important to understand, that UBIFS +is completely different to any traditional file-system in Linux, like +Ext2, XFS, JFS, etc. UBIFS represents a separate class of file-systems +which work with MTD devices, not block devices. The other Linux +file-system of this class is JFFS2. + +To make it more clear, here is a small comparison of MTD devices and +block devices. + +1 MTD devices represent flash devices and they consist of eraseblocks of + rather large size, typically about 128KiB. 
Block devices consist of + small blocks, typically 512 bytes. +2 MTD devices support 3 main operations - read from some offset within an + eraseblock, write to some offset within an eraseblock, and erase a whole + eraseblock. Block devices support 2 main operations - read a whole + block and write a whole block. +3 The whole eraseblock has to be erased before it becomes possible to + re-write its contents. Blocks may be just re-written. +4 Eraseblocks become worn out after some number of erase cycles - + typically 100K-1G for SLC NAND and NOR flashes, and 1K-10K for MLC + NAND flashes. Blocks do not have the wear-out property. +5 Eraseblocks may become bad (only on NAND flashes) and software should + deal with this. Blocks on hard drives typically do not become bad, + because hardware has mechanisms to substitute bad blocks, at least in + modern LBA disks. + +It should be quite obvious why UBIFS is very different to traditional +file-systems. + +UBIFS works on top of UBI. UBI is a separate software layer which may be +found in drivers/mtd/ubi. UBI is basically a volume management and +wear-leveling layer. It provides so called UBI volumes which is a higher +level abstraction than a MTD device. The programming model of UBI devices +is very similar to MTD devices - they still consist of large eraseblocks, +they have read/write/erase operations, but UBI devices are devoid of +limitations like wear and bad blocks (items 4 and 5 in the above list). + +In a sense, UBIFS is a next generation of JFFS2 file-system, but it is +very different and incompatible to JFFS2. The following are the main +differences. + +* JFFS2 works on top of MTD devices, UBIFS depends on UBI and works on + top of UBI volumes. +* JFFS2 does not have on-media index and has to build it while mounting, + which requires full media scan. UBIFS maintains the FS indexing + information on the flash media and does not require full media scan, + so it mounts many times faster than JFFS2. 
+* JFFS2 is a write-through file-system, while UBIFS supports write-back, + which makes UBIFS much faster on writes. + +Similarly to JFFS2, UBIFS supports on-the-fly compression which makes +it possible to fit quite a lot of data into the flash. + +Similarly to JFFS2, UBIFS is tolerant of unclean reboots and power-cuts. +It does not need stuff like fsck.ext2. UBIFS automatically replays its +journal and recovers from crashes, ensuring that the on-flash data +structures are consistent. + +UBIFS scales logarithmically (most of the data structures it uses are +trees), so the mount time and memory consumption do not linearly depend +on the flash size, like in case of JFFS2. This is because UBIFS +maintains the FS index on the flash media. However, UBIFS depends on +UBI, which scales linearly. So overall UBI/UBIFS stack scales linearly. +Nevertheless, UBI/UBIFS scales considerably better than JFFS2. + +The authors of UBIFS believe, that it is possible to develop UBI2 which +would scale logarithmically as well. UBI2 would support the same API as UBI, +but it would be binary incompatible to UBI. So UBIFS would not need to be +changed to use UBI2. + + +Mount options +============= + +(*) == default. + +bulk_read read more in one go to take advantage of flash + media that read faster sequentially +no_bulk_read (*) do not bulk-read +no_chk_data_crc skip checking of CRCs on data nodes in order to + improve read performance. Use this option only + if the flash media is highly reliable. The effect + of this option is that corruption of the contents + of a file can go unnoticed. 
+chk_data_crc (*) do not skip checking CRCs on data nodes +compr=none override default compressor and set it to "none" +compr=lzo override default compressor and set it to "lzo" +compr=zlib override default compressor and set it to "zlib" + + +Quick usage instructions +======================== + +The UBI volume to mount is specified using "ubiX_Y" or "ubiX:NAME" syntax, +where "X" is UBI device number, "Y" is UBI volume number, and "NAME" is +UBI volume name. + +Mount volume 0 on UBI device 0 to /mnt/ubifs: +$ mount -t ubifs ubi0_0 /mnt/ubifs + +Mount "rootfs" volume of UBI device 0 to /mnt/ubifs ("rootfs" is volume +name): +$ mount -t ubifs ubi0:rootfs /mnt/ubifs + +The following is an example of the kernel boot arguments to attach mtd0 +to UBI and mount volume "rootfs": +ubi.mtd=0 root=ubi0:rootfs rootfstype=ubifs + + +Module Parameters for Debugging +=============================== + +When UBIFS has been compiled with debugging enabled, there are 3 module +parameters that are available to control aspects of testing and debugging. +The parameters are unsigned integers where each bit controls an option. 
+The parameters are: + +debug_msgs Selects which debug messages to display, as follows: + + Message Type Flag value + + General messages 1 + Journal messages 2 + Mount messages 4 + Commit messages 8 + LEB search messages 16 + Budgeting messages 32 + Garbage collection messages 64 + Tree Node Cache (TNC) messages 128 + LEB properties (lprops) messages 256 + Input/output messages 512 + Log messages 1024 + Scan messages 2048 + Recovery messages 4096 + +debug_chks Selects extra checks that UBIFS can do while running: + + Check Flag value + + General checks 1 + Check Tree Node Cache (TNC) 2 + Check indexing tree size 4 + Check orphan area 8 + Check old indexing tree 16 + Check LEB properties (lprops) 32 + Check leaf nodes and inodes 64 + +debug_tsts Selects a mode of testing, as follows: + + Test mode Flag value + + Force in-the-gaps method 2 + Failure mode for recovery testing 4 + +For example, set debug_msgs to 5 to display General messages and Mount +messages. + + +References +========== + +UBIFS documentation and FAQ/HOWTO at the MTD web site: +http://www.linux-mtd.infradead.org/doc/ubifs.html +http://www.linux-mtd.infradead.org/faq/ubifs.html diff -Nurd linux-2.6.24/MAINTAINERS ubifs-v2.6.24/MAINTAINERS --- linux-2.6.24/MAINTAINERS 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/MAINTAINERS 2009-04-07 17:14:47.000000000 +0200 @@ -2148,6 +2148,15 @@ W: http://www.linux-mtd.infradead.org/doc/jffs2.html S: Maintained +UBI FILE SYSTEM (UBIFS) +P: Artem Bityutskiy +M: dedekind@infradead.org +P: Adrian Hunter +M: ext-adrian.hunter@nokia.com +L: linux-mtd@lists.infradead.org +W: http://www.linux-mtd.infradead.org/doc/ubifs.html +S: Maintained + JFS FILESYSTEM P: Dave Kleikamp M: shaggy@austin.ibm.com @@ -2547,7 +2556,7 @@ M: dedekind@infradead.org W: http://www.linux-mtd.infradead.org/ L: linux-mtd@lists.infradead.org -T: git git://git.infradead.org/~dedekind/ubi-2.6.git +T: git git://git.infradead.org/ubi-2.6.git S: Maintained MICROTEK X6 SCANNER diff -Nurd 
linux-2.6.24/crypto/Kconfig ubifs-v2.6.24/crypto/Kconfig --- linux-2.6.24/crypto/Kconfig 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/crypto/Kconfig 2009-04-07 17:14:47.000000000 +0200 @@ -502,6 +502,14 @@ Authenc: Combined mode wrapper for IPsec. This is required for IPSec. +config CRYPTO_LZO + tristate "LZO compression algorithm" + select CRYPTO_ALGAPI + select LZO_COMPRESS + select LZO_DECOMPRESS + help + This is the LZO algorithm. + source "drivers/crypto/Kconfig" endif # if CRYPTO diff -Nurd linux-2.6.24/crypto/Makefile ubifs-v2.6.24/crypto/Makefile --- linux-2.6.24/crypto/Makefile 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/crypto/Makefile 2009-04-07 17:14:47.000000000 +0200 @@ -51,6 +51,7 @@ obj-$(CONFIG_CRYPTO_DEFLATE) += deflate.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o +obj-$(CONFIG_CRYPTO_LZO) += lzo.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o diff -Nurd linux-2.6.24/crypto/lzo.c ubifs-v2.6.24/crypto/lzo.c --- linux-2.6.24/crypto/lzo.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/crypto/lzo.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,106 @@ +/* + * Cryptographic API. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include + +struct lzo_ctx { + void *lzo_comp_mem; +}; + +static int lzo_init(struct crypto_tfm *tfm) +{ + struct lzo_ctx *ctx = crypto_tfm_ctx(tfm); + + ctx->lzo_comp_mem = vmalloc(LZO1X_MEM_COMPRESS); + if (!ctx->lzo_comp_mem) + return -ENOMEM; + + return 0; +} + +static void lzo_exit(struct crypto_tfm *tfm) +{ + struct lzo_ctx *ctx = crypto_tfm_ctx(tfm); + + vfree(ctx->lzo_comp_mem); +} + +static int lzo_compress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + struct lzo_ctx *ctx = crypto_tfm_ctx(tfm); + size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */ + int err; + + err = lzo1x_1_compress(src, slen, dst, &tmp_len, ctx->lzo_comp_mem); + + if (err != LZO_E_OK) + return -EINVAL; + + *dlen = tmp_len; + return 0; +} + +static int lzo_decompress(struct crypto_tfm *tfm, const u8 *src, + unsigned int slen, u8 *dst, unsigned int *dlen) +{ + int err; + size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */ + + err = lzo1x_decompress_safe(src, slen, dst, &tmp_len); + + if (err != LZO_E_OK) + return -EINVAL; + + *dlen = tmp_len; + return 0; + +} + +static struct crypto_alg alg = { + .cra_name = "lzo", + .cra_flags = CRYPTO_ALG_TYPE_COMPRESS, + .cra_ctxsize = sizeof(struct lzo_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_init = lzo_init, + .cra_exit = lzo_exit, + .cra_u = { .compress = { + .coa_compress = lzo_compress, + .coa_decompress = lzo_decompress } } +}; + +static int __init init(void) +{ + return crypto_register_alg(&alg); +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("LZO Compression 
Algorithm"); diff -Nurd linux-2.6.24/crypto/tcrypt.c ubifs-v2.6.24/crypto/tcrypt.c --- linux-2.6.24/crypto/tcrypt.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/crypto/tcrypt.c 2009-04-07 17:14:47.000000000 +0200 @@ -78,7 +78,7 @@ "twofish", "serpent", "sha384", "sha512", "md4", "aes", "cast6", "arc4", "michael_mic", "deflate", "crc32c", "tea", "xtea", "khazad", "wp512", "wp384", "wp256", "tnepres", "xeta", "fcrypt", - "camellia", "seed", NULL + "camellia", "seed", "lzo", NULL }; static void hexdump(unsigned char *buf, unsigned int len) @@ -800,7 +800,8 @@ crypto_free_hash(tfm); } -static void test_deflate(void) +static void test_comp(char *algo, struct comp_testvec *ctemplate, + struct comp_testvec *dtemplate, int ctcount, int dtcount) { unsigned int i; char result[COMP_BUF_SIZE]; @@ -808,25 +809,26 @@ struct comp_testvec *tv; unsigned int tsize; - printk("\ntesting deflate compression\n"); + printk("\ntesting %s compression\n", algo); - tsize = sizeof (deflate_comp_tv_template); + tsize = sizeof(struct comp_testvec); + tsize *= ctcount; if (tsize > TVMEMSIZE) { printk("template (%u) too big for tvmem (%u)\n", tsize, TVMEMSIZE); return; } - memcpy(tvmem, deflate_comp_tv_template, tsize); + memcpy(tvmem, ctemplate, tsize); tv = (void *)tvmem; - tfm = crypto_alloc_comp("deflate", 0, CRYPTO_ALG_ASYNC); + tfm = crypto_alloc_comp(algo, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { - printk("failed to load transform for deflate\n"); + printk("failed to load transform for %s\n", algo); return; } - for (i = 0; i < DEFLATE_COMP_TEST_VECTORS; i++) { + for (i = 0; i < ctcount; i++) { int ilen, ret, dlen = COMP_BUF_SIZE; printk("test %u:\n", i + 1); @@ -845,19 +847,20 @@ ilen, dlen); } - printk("\ntesting deflate decompression\n"); + printk("\ntesting %s decompression\n", algo); - tsize = sizeof (deflate_decomp_tv_template); + tsize = sizeof(struct comp_testvec); + tsize *= dtcount; if (tsize > TVMEMSIZE) { printk("template (%u) too big for tvmem (%u)\n", tsize, 
TVMEMSIZE); goto out; } - memcpy(tvmem, deflate_decomp_tv_template, tsize); + memcpy(tvmem, dtemplate, tsize); tv = (void *)tvmem; - for (i = 0; i < DEFLATE_DECOMP_TEST_VECTORS; i++) { + for (i = 0; i < dtcount; i++) { int ilen, ret, dlen = COMP_BUF_SIZE; printk("test %u:\n", i + 1); @@ -1057,7 +1060,11 @@ test_hash("tgr192", tgr192_tv_template, TGR192_TEST_VECTORS); test_hash("tgr160", tgr160_tv_template, TGR160_TEST_VECTORS); test_hash("tgr128", tgr128_tv_template, TGR128_TEST_VECTORS); - test_deflate(); + test_comp("deflate", deflate_comp_tv_template, + deflate_decomp_tv_template, DEFLATE_COMP_TEST_VECTORS, + DEFLATE_DECOMP_TEST_VECTORS); + test_comp("lzo", lzo_comp_tv_template, lzo_decomp_tv_template, + LZO_COMP_TEST_VECTORS, LZO_DECOMP_TEST_VECTORS); test_hash("crc32c", crc32c_tv_template, CRC32C_TEST_VECTORS); test_hash("hmac(md5)", hmac_md5_tv_template, HMAC_MD5_TEST_VECTORS); @@ -1167,7 +1174,9 @@ break; case 13: - test_deflate(); + test_comp("deflate", deflate_comp_tv_template, + deflate_decomp_tv_template, DEFLATE_COMP_TEST_VECTORS, + DEFLATE_DECOMP_TEST_VECTORS); break; case 14: @@ -1292,6 +1301,11 @@ CAMELLIA_CBC_DEC_TEST_VECTORS); break; + case 33: + test_comp("lzo", lzo_comp_tv_template, lzo_decomp_tv_template, + LZO_COMP_TEST_VECTORS, LZO_DECOMP_TEST_VECTORS); + break; + case 100: test_hash("hmac(md5)", hmac_md5_tv_template, HMAC_MD5_TEST_VECTORS); diff -Nurd linux-2.6.24/crypto/tcrypt.h ubifs-v2.6.24/crypto/tcrypt.h --- linux-2.6.24/crypto/tcrypt.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/crypto/tcrypt.h 2009-04-07 17:14:47.000000000 +0200 @@ -4408,6 +4408,88 @@ }; /* + * LZO test vectors (null-terminated strings). 
+ */ +#define LZO_COMP_TEST_VECTORS 2 +#define LZO_DECOMP_TEST_VECTORS 2 + +static struct comp_testvec lzo_comp_tv_template[] = { + { + .inlen = 70, + .outlen = 46, + .input = "Join us now and share the software " + "Join us now and share the software ", + .output = { 0x00, 0x0d, 0x4a, 0x6f, 0x69, 0x6e, 0x20, 0x75, + 0x73, 0x20, 0x6e, 0x6f, 0x77, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x73, 0x6f, 0x66, 0x74, + 0x77, 0x70, 0x01, 0x01, 0x4a, 0x6f, 0x69, 0x6e, + 0x3d, 0x88, 0x00, 0x11, 0x00, 0x00 }, + }, { + .inlen = 159, + .outlen = 133, + .input = "This document describes a compression method based on the LZO " + "compression algorithm. This document defines the application of " + "the LZO algorithm used in UBIFS.", + .output = { 0x00, 0x2b, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, + 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x70, + 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, + 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x20, 0x62, + 0x61, 0x73, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x4c, 0x5a, 0x4f, 0x2b, + 0x8c, 0x00, 0x0d, 0x61, 0x6c, 0x67, 0x6f, 0x72, + 0x69, 0x74, 0x68, 0x6d, 0x2e, 0x20, 0x20, 0x54, + 0x68, 0x69, 0x73, 0x2a, 0x54, 0x01, 0x02, 0x66, + 0x69, 0x6e, 0x65, 0x73, 0x94, 0x06, 0x05, 0x61, + 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x76, + 0x0a, 0x6f, 0x66, 0x88, 0x02, 0x60, 0x09, 0x27, + 0xf0, 0x00, 0x0c, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x69, 0x6e, 0x20, 0x55, 0x42, 0x49, 0x46, + 0x53, 0x2e, 0x11, 0x00, 0x00 }, + }, +}; + +static struct comp_testvec lzo_decomp_tv_template[] = { + { + .inlen = 133, + .outlen = 159, + .input = { 0x00, 0x2b, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, + 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x20, + 0x64, 0x65, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, + 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6d, 0x70, + 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, + 0x6d, 
0x65, 0x74, 0x68, 0x6f, 0x64, 0x20, 0x62, + 0x61, 0x73, 0x65, 0x64, 0x20, 0x6f, 0x6e, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x4c, 0x5a, 0x4f, 0x2b, + 0x8c, 0x00, 0x0d, 0x61, 0x6c, 0x67, 0x6f, 0x72, + 0x69, 0x74, 0x68, 0x6d, 0x2e, 0x20, 0x20, 0x54, + 0x68, 0x69, 0x73, 0x2a, 0x54, 0x01, 0x02, 0x66, + 0x69, 0x6e, 0x65, 0x73, 0x94, 0x06, 0x05, 0x61, + 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x76, + 0x0a, 0x6f, 0x66, 0x88, 0x02, 0x60, 0x09, 0x27, + 0xf0, 0x00, 0x0c, 0x20, 0x75, 0x73, 0x65, 0x64, + 0x20, 0x69, 0x6e, 0x20, 0x55, 0x42, 0x49, 0x46, + 0x53, 0x2e, 0x11, 0x00, 0x00 }, + .output = "This document describes a compression method based on the LZO " + "compression algorithm. This document defines the application of " + "the LZO algorithm used in UBIFS.", + }, { + .inlen = 46, + .outlen = 70, + .input = { 0x00, 0x0d, 0x4a, 0x6f, 0x69, 0x6e, 0x20, 0x75, + 0x73, 0x20, 0x6e, 0x6f, 0x77, 0x20, 0x61, 0x6e, + 0x64, 0x20, 0x73, 0x68, 0x61, 0x72, 0x65, 0x20, + 0x74, 0x68, 0x65, 0x20, 0x73, 0x6f, 0x66, 0x74, + 0x77, 0x70, 0x01, 0x01, 0x4a, 0x6f, 0x69, 0x6e, + 0x3d, 0x88, 0x00, 0x11, 0x00, 0x00 }, + .output = "Join us now and share the software " + "Join us now and share the software ", + }, +}; + +/* * Michael MIC test vectors from IEEE 802.11i */ #define MICHAEL_MIC_TEST_VECTORS 6 diff -Nurd linux-2.6.24/drivers/mtd/nand/nandsim.c ubifs-v2.6.24/drivers/mtd/nand/nandsim.c --- linux-2.6.24/drivers/mtd/nand/nandsim.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/nand/nandsim.c 2009-04-07 17:14:47.000000000 +0200 @@ -39,6 +39,7 @@ #include #include #include +#include /* Default simulator parameters values */ #if !defined(CONFIG_NANDSIM_FIRST_ID_BYTE) || \ @@ -298,11 +299,11 @@ /* NAND flash "geometry" */ struct nandsin_geometry { - uint32_t totsz; /* total flash size, bytes */ + uint64_t totsz; /* total flash size, bytes */ uint32_t secsz; /* flash sector (erase block) size, bytes */ uint pgsz; /* NAND flash page size, bytes */ uint oobsz; /* page OOB area size, 
bytes */ - uint32_t totszoob; /* total flash size including OOB, bytes */ + uint64_t totszoob; /* total flash size including OOB, bytes */ uint pgszoob; /* page size including OOB , bytes*/ uint secszoob; /* sector size including OOB, bytes */ uint pgnum; /* total number of pages */ @@ -459,6 +460,12 @@ return kstrdup(buf, GFP_KERNEL); } +static u_int64_t divide(u_int64_t n, u_int32_t d) +{ + do_div(n, d); + return n; +} + /* * Initialize the nandsim structure. * @@ -469,8 +476,8 @@ struct nand_chip *chip = (struct nand_chip *)mtd->priv; struct nandsim *ns = (struct nandsim *)(chip->priv); int i, ret = 0; - u_int32_t remains; - u_int32_t next_offset; + u_int64_t remains; + u_int64_t next_offset; if (NS_IS_INITIALIZED(ns)) { NS_ERR("init_nandsim: nandsim is already initialized\n"); @@ -487,8 +494,8 @@ ns->geom.oobsz = mtd->oobsize; ns->geom.secsz = mtd->erasesize; ns->geom.pgszoob = ns->geom.pgsz + ns->geom.oobsz; - ns->geom.pgnum = ns->geom.totsz / ns->geom.pgsz; - ns->geom.totszoob = ns->geom.totsz + ns->geom.pgnum * ns->geom.oobsz; + ns->geom.pgnum = divide(ns->geom.totsz, ns->geom.pgsz); + ns->geom.totszoob = ns->geom.totsz + (uint64_t)ns->geom.pgnum * ns->geom.oobsz; ns->geom.secshift = ffs(ns->geom.secsz) - 1; ns->geom.pgshift = chip->page_shift; ns->geom.oobshift = ffs(ns->geom.oobsz) - 1; @@ -511,7 +518,7 @@ } if (ns->options & OPT_SMALLPAGE) { - if (ns->geom.totsz < (32 << 20)) { + if (ns->geom.totsz <= (32 << 20)) { ns->geom.pgaddrbytes = 3; ns->geom.secaddrbytes = 2; } else { @@ -537,15 +544,16 @@ remains = ns->geom.totsz; next_offset = 0; for (i = 0; i < parts_num; ++i) { - unsigned long part = parts[i]; - if (!part || part > remains / ns->geom.secsz) { + u_int64_t part_sz = (u_int64_t)parts[i] * ns->geom.secsz; + + if (!part_sz || part_sz > remains) { NS_ERR("bad partition size.\n"); ret = -EINVAL; goto error; } ns->partitions[i].name = get_partition_name(i); ns->partitions[i].offset = next_offset; - ns->partitions[i].size = part * ns->geom.secsz; + 
ns->partitions[i].size = part_sz; next_offset += ns->partitions[i].size; remains -= ns->partitions[i].size; } @@ -573,7 +581,8 @@ if (ns->busw == 16) NS_WARN("16-bit flashes support wasn't tested\n"); - printk("flash size: %u MiB\n", ns->geom.totsz >> 20); + printk("flash size: %llu MiB\n", + (unsigned long long)ns->geom.totsz >> 20); printk("page size: %u bytes\n", ns->geom.pgsz); printk("OOB area size: %u bytes\n", ns->geom.oobsz); printk("sector size: %u KiB\n", ns->geom.secsz >> 10); @@ -582,8 +591,9 @@ printk("bus width: %u\n", ns->busw); printk("bits in sector size: %u\n", ns->geom.secshift); printk("bits in page size: %u\n", ns->geom.pgshift); - printk("bits in OOB size: %u\n", ns->geom.oobshift); - printk("flash size with OOB: %u KiB\n", ns->geom.totszoob >> 10); + printk("bits in OOB size: %u\n", ns->geom.oobshift); + printk("flash size with OOB: %llu KiB\n", + (unsigned long long)ns->geom.totszoob >> 10); printk("page address bytes: %u\n", ns->geom.pgaddrbytes); printk("sector address bytes: %u\n", ns->geom.secaddrbytes); printk("options: %#x\n", ns->options); @@ -825,7 +835,7 @@ if (!rptwear) return 0; - wear_eb_count = mtd->size / mtd->erasesize; + wear_eb_count = divide(mtd->size, mtd->erasesize); mem = wear_eb_count * sizeof(unsigned long); if (mem / sizeof(unsigned long) != wear_eb_count) { NS_ERR("Too many erase blocks for wear reporting\n"); @@ -2013,7 +2023,7 @@ } if (overridesize) { - u_int32_t new_size = nsmtd->erasesize << overridesize; + u_int64_t new_size = (u_int64_t)nsmtd->erasesize << overridesize; if (new_size >> overridesize != nsmtd->erasesize) { NS_ERR("overridesize is too big\n"); goto err_exit; @@ -2021,7 +2031,8 @@ /* N.B. 
This relies on nand_scan not doing anything with the size before we change it */ nsmtd->size = new_size; chip->chipsize = new_size; - chip->chip_shift = ffs(new_size) - 1; + chip->chip_shift = ffs(nsmtd->erasesize) + overridesize - 1; + chip->pagemask = (chip->chipsize >> chip->page_shift) - 1; } if ((retval = setup_wear_reporting(nsmtd)) != 0) diff -Nurd linux-2.6.24/drivers/mtd/ubi/Kconfig ubifs-v2.6.24/drivers/mtd/ubi/Kconfig --- linux-2.6.24/drivers/mtd/ubi/Kconfig 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/Kconfig 2009-04-07 17:14:47.000000000 +0200 @@ -24,8 +24,13 @@ erase counter value and the lowest erase counter value of eraseblocks of UBI devices. When this threshold is exceeded, UBI starts performing wear leveling by means of moving data from eraseblock with low erase - counter to eraseblocks with high erase counter. Leave the default - value if unsure. + counter to eraseblocks with high erase counter. + + The default value should be OK for SLC NAND flashes, NOR flashes and + other flashes which have eraseblock life-cycle 100000 or more. + However, in case of MLC NAND flashes which typically have eraseblock + life-cycle less than 10000, the threshold should be lessened (e.g., + to 128 or 256, although it does not have to be a power of 2). config MTD_UBI_BEB_RESERVE int "Percentage of reserved eraseblocks for bad eraseblocks handling" diff -Nurd linux-2.6.24/drivers/mtd/ubi/Kconfig.debug ubifs-v2.6.24/drivers/mtd/ubi/Kconfig.debug --- linux-2.6.24/drivers/mtd/ubi/Kconfig.debug 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/Kconfig.debug 2009-04-07 17:14:47.000000000 +0200 @@ -33,16 +33,6 @@ This option switches the background thread off by default. The thread may be also be enabled/disabled via UBI sysfs. 
-config MTD_UBI_DEBUG_USERSPACE_IO - bool "Direct user-space write/erase support" - default n - depends on MTD_UBI_DEBUG - help - By default, users cannot directly write and erase individual - eraseblocks of dynamic volumes, and have to use update operation - instead. This option enables this capability - it is very useful for - debugging and testing. - config MTD_UBI_DEBUG_EMULATE_BITFLIPS bool "Emulate flash bit-flips" depends on MTD_UBI_DEBUG diff -Nurd linux-2.6.24/drivers/mtd/ubi/build.c ubifs-v2.6.24/drivers/mtd/ubi/build.c --- linux-2.6.24/drivers/mtd/ubi/build.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/build.c 2009-04-07 17:14:47.000000000 +0200 @@ -21,11 +21,16 @@ */ /* - * This file includes UBI initialization and building of UBI devices. At the - * moment UBI devices may only be added while UBI is initialized, but dynamic - * device add/remove functionality is planned. Also, at the moment we only - * attach UBI devices by scanning, which will become a bottleneck when flashes - * reach certain large size. Then one may improve UBI and add other methods. + * This file includes UBI initialization and building of UBI devices. + * + * When UBI is initialized, it attaches all the MTD devices specified as the + * module load parameters or the kernel boot parameters. If no MTD devices were + * specified, UBI does not attach any MTD device, but it is possible to do so + * later using the "UBI control device". + * + * At the moment we only attach UBI devices by scanning, which will become a + * bottleneck when flashes reach certain large size. Then one may improve UBI + * and add other methods, although it does not seem to be easy to do. */ #include @@ -33,7 +38,9 @@ #include #include #include +#include #include +#include #include "ubi.h" /* Maximum length of the 'mtd=' parameter */ @@ -43,29 +50,39 @@ * struct mtd_dev_param - MTD device parameter description data structure. 
* @name: MTD device name or number string * @vid_hdr_offs: VID header offset - * @data_offs: data offset */ -struct mtd_dev_param -{ +struct mtd_dev_param { char name[MTD_PARAM_LEN_MAX]; int vid_hdr_offs; - int data_offs; }; /* Numbers of elements set in the @mtd_dev_param array */ -static int mtd_devs = 0; +static int mtd_devs; /* MTD devices specification parameters */ static struct mtd_dev_param mtd_dev_param[UBI_MAX_DEVICES]; -/* Number of UBI devices in system */ -int ubi_devices_cnt; +/* Root UBI "class" object (corresponds to '//class/ubi/') */ +struct class *ubi_class; + +/* Slab cache for wear-leveling entries */ +struct kmem_cache *ubi_wl_entry_slab; + +/* UBI control character device */ +static struct miscdevice ubi_ctrl_cdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ubi_ctrl", + .fops = &ubi_ctrl_cdev_operations, +}; /* All UBI devices in system */ -struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; +static struct ubi_device *ubi_devices[UBI_MAX_DEVICES]; -/* Root UBI "class" object (corresponds to '//class/ubi/') */ -struct class *ubi_class; +/* Serializes UBI devices creations and removals */ +DEFINE_MUTEX(ubi_devices_mutex); + +/* Protects @ubi_devices and @ubi->ref_count */ +static DEFINE_SPINLOCK(ubi_devices_lock); /* "Show" method for files in '//class/ubi/' */ static ssize_t ubi_version_show(struct class *class, char *buf) @@ -101,42 +118,157 @@ __ATTR(min_io_size, S_IRUGO, dev_attribute_show, NULL); static struct device_attribute dev_bgt_enabled = __ATTR(bgt_enabled, S_IRUGO, dev_attribute_show, NULL); +static struct device_attribute dev_mtd_num = + __ATTR(mtd_num, S_IRUGO, dev_attribute_show, NULL); + +/** + * ubi_get_device - get UBI device. + * @ubi_num: UBI device number + * + * This function returns UBI device description object for UBI device number + * @ubi_num, or %NULL if the device does not exist. This function increases the + * device reference count to prevent removal of the device. 
In other words, the + * device cannot be removed if its reference count is not zero. + */ +struct ubi_device *ubi_get_device(int ubi_num) +{ + struct ubi_device *ubi; + + spin_lock(&ubi_devices_lock); + ubi = ubi_devices[ubi_num]; + if (ubi) { + ubi_assert(ubi->ref_count >= 0); + ubi->ref_count += 1; + get_device(&ubi->dev); + } + spin_unlock(&ubi_devices_lock); + + return ubi; +} + +/** + * ubi_put_device - drop an UBI device reference. + * @ubi: UBI device description object + */ +void ubi_put_device(struct ubi_device *ubi) +{ + spin_lock(&ubi_devices_lock); + ubi->ref_count -= 1; + put_device(&ubi->dev); + spin_unlock(&ubi_devices_lock); +} + +/** + * ubi_get_by_major - get UBI device by character device major number. + * @major: major number + * + * This function is similar to 'ubi_get_device()', but it searches the device + * by its major number. + */ +struct ubi_device *ubi_get_by_major(int major) +{ + int i; + struct ubi_device *ubi; + + spin_lock(&ubi_devices_lock); + for (i = 0; i < UBI_MAX_DEVICES; i++) { + ubi = ubi_devices[i]; + if (ubi && MAJOR(ubi->cdev.dev) == major) { + ubi_assert(ubi->ref_count >= 0); + ubi->ref_count += 1; + get_device(&ubi->dev); + spin_unlock(&ubi_devices_lock); + return ubi; + } + } + spin_unlock(&ubi_devices_lock); + + return NULL; +} + +/** + * ubi_major2num - get UBI device number by character device major number. + * @major: major number + * + * This function searches UBI device number object by its major number. If UBI + * device was not found, this function returns -ENODEV, otherwise the UBI device + * number is returned. 
+ */ +int ubi_major2num(int major) +{ + int i, ubi_num = -ENODEV; + + spin_lock(&ubi_devices_lock); + for (i = 0; i < UBI_MAX_DEVICES; i++) { + struct ubi_device *ubi = ubi_devices[i]; + + if (ubi && MAJOR(ubi->cdev.dev) == major) { + ubi_num = ubi->ubi_num; + break; + } + } + spin_unlock(&ubi_devices_lock); + + return ubi_num; +} /* "Show" method for files in '//class/ubi/ubiX/' */ static ssize_t dev_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { - const struct ubi_device *ubi; + ssize_t ret; + struct ubi_device *ubi; + /* + * The below code looks weird, but it actually makes sense. We get the + * UBI device reference from the contained 'struct ubi_device'. But it + * is unclear if the device was removed or not yet. Indeed, if the + * device was removed before we increased its reference count, + * 'ubi_get_device()' will return -ENODEV and we fail. + * + * Remember, 'struct ubi_device' is freed in the release function, so + * we still can use 'ubi->ubi_num'. 
+ */ ubi = container_of(dev, struct ubi_device, dev); + ubi = ubi_get_device(ubi->ubi_num); + if (!ubi) + return -ENODEV; + if (attr == &dev_eraseblock_size) - return sprintf(buf, "%d\n", ubi->leb_size); + ret = sprintf(buf, "%d\n", ubi->leb_size); else if (attr == &dev_avail_eraseblocks) - return sprintf(buf, "%d\n", ubi->avail_pebs); + ret = sprintf(buf, "%d\n", ubi->avail_pebs); else if (attr == &dev_total_eraseblocks) - return sprintf(buf, "%d\n", ubi->good_peb_count); + ret = sprintf(buf, "%d\n", ubi->good_peb_count); else if (attr == &dev_volumes_count) - return sprintf(buf, "%d\n", ubi->vol_count); + ret = sprintf(buf, "%d\n", ubi->vol_count - UBI_INT_VOL_COUNT); else if (attr == &dev_max_ec) - return sprintf(buf, "%d\n", ubi->max_ec); + ret = sprintf(buf, "%d\n", ubi->max_ec); else if (attr == &dev_reserved_for_bad) - return sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); + ret = sprintf(buf, "%d\n", ubi->beb_rsvd_pebs); else if (attr == &dev_bad_peb_count) - return sprintf(buf, "%d\n", ubi->bad_peb_count); + ret = sprintf(buf, "%d\n", ubi->bad_peb_count); else if (attr == &dev_max_vol_count) - return sprintf(buf, "%d\n", ubi->vtbl_slots); + ret = sprintf(buf, "%d\n", ubi->vtbl_slots); else if (attr == &dev_min_io_size) - return sprintf(buf, "%d\n", ubi->min_io_size); + ret = sprintf(buf, "%d\n", ubi->min_io_size); else if (attr == &dev_bgt_enabled) - return sprintf(buf, "%d\n", ubi->thread_enabled); + ret = sprintf(buf, "%d\n", ubi->thread_enabled); + else if (attr == &dev_mtd_num) + ret = sprintf(buf, "%d\n", ubi->mtd->index); else - BUG(); + ret = -EINVAL; - return 0; + ubi_put_device(ubi); + return ret; } -/* Fake "release" method for UBI devices */ -static void dev_release(struct device *dev) { } +static void dev_release(struct device *dev) +{ + struct ubi_device *ubi = container_of(dev, struct ubi_device, dev); + + kfree(ubi); +} /** * ubi_sysfs_init - initialize sysfs for an UBI device. 
@@ -150,68 +282,44 @@ int err; ubi->dev.release = dev_release; - ubi->dev.devt = MKDEV(ubi->major, 0); + ubi->dev.devt = ubi->cdev.dev; ubi->dev.class = ubi_class; sprintf(&ubi->dev.bus_id[0], UBI_NAME_STR"%d", ubi->ubi_num); err = device_register(&ubi->dev); if (err) - goto out; + return err; err = device_create_file(&ubi->dev, &dev_eraseblock_size); if (err) - goto out_unregister; + return err; err = device_create_file(&ubi->dev, &dev_avail_eraseblocks); if (err) - goto out_eraseblock_size; + return err; err = device_create_file(&ubi->dev, &dev_total_eraseblocks); if (err) - goto out_avail_eraseblocks; + return err; err = device_create_file(&ubi->dev, &dev_volumes_count); if (err) - goto out_total_eraseblocks; + return err; err = device_create_file(&ubi->dev, &dev_max_ec); if (err) - goto out_volumes_count; + return err; err = device_create_file(&ubi->dev, &dev_reserved_for_bad); if (err) - goto out_volumes_max_ec; + return err; err = device_create_file(&ubi->dev, &dev_bad_peb_count); if (err) - goto out_reserved_for_bad; + return err; err = device_create_file(&ubi->dev, &dev_max_vol_count); if (err) - goto out_bad_peb_count; + return err; err = device_create_file(&ubi->dev, &dev_min_io_size); if (err) - goto out_max_vol_count; + return err; err = device_create_file(&ubi->dev, &dev_bgt_enabled); if (err) - goto out_min_io_size; - - return 0; - -out_min_io_size: - device_remove_file(&ubi->dev, &dev_min_io_size); -out_max_vol_count: - device_remove_file(&ubi->dev, &dev_max_vol_count); -out_bad_peb_count: - device_remove_file(&ubi->dev, &dev_bad_peb_count); -out_reserved_for_bad: - device_remove_file(&ubi->dev, &dev_reserved_for_bad); -out_volumes_max_ec: - device_remove_file(&ubi->dev, &dev_max_ec); -out_volumes_count: - device_remove_file(&ubi->dev, &dev_volumes_count); -out_total_eraseblocks: - device_remove_file(&ubi->dev, &dev_total_eraseblocks); -out_avail_eraseblocks: - device_remove_file(&ubi->dev, &dev_avail_eraseblocks); -out_eraseblock_size: - 
device_remove_file(&ubi->dev, &dev_eraseblock_size); -out_unregister: - device_unregister(&ubi->dev); -out: - ubi_err("failed to initialize sysfs for %s", ubi->ubi_name); + return err; + err = device_create_file(&ubi->dev, &dev_mtd_num); return err; } @@ -221,6 +329,7 @@ */ static void ubi_sysfs_close(struct ubi_device *ubi) { + device_remove_file(&ubi->dev, &dev_mtd_num); device_remove_file(&ubi->dev, &dev_bgt_enabled); device_remove_file(&ubi->dev, &dev_min_io_size); device_remove_file(&ubi->dev, &dev_max_vol_count); @@ -244,7 +353,26 @@ for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) - ubi_free_volume(ubi, i); + ubi_free_volume(ubi, ubi->volumes[i]); +} + +/** + * free_user_volumes - free all user volumes. + * @ubi: UBI device description object + * + * Normally the volumes are freed at the release function of the volume device + * objects. However, on error paths the volumes have to be freed before the + * device objects have been initialized. + */ +static void free_user_volumes(struct ubi_device *ubi) +{ + int i; + + for (i = 0; i < ubi->vtbl_slots; i++) + if (ubi->volumes[i]) { + kfree(ubi->volumes[i]->eba_tbl); + kfree(ubi->volumes[i]); + } } /** @@ -252,16 +380,13 @@ * @ubi: UBI device description object * * This function returns zero in case of success and a negative error code in - * case of failure. + * case of failure. Note, this function destroys all volumes if it fails.
*/ static int uif_init(struct ubi_device *ubi) { int i, err; dev_t dev; - mutex_init(&ubi->vtbl_mutex); - spin_lock_init(&ubi->volumes_lock); - sprintf(ubi->ubi_name, UBI_NAME_STR "%d", ubi->ubi_num); /* @@ -278,52 +403,72 @@ return err; } + ubi_assert(MINOR(dev) == 0); cdev_init(&ubi->cdev, &ubi_cdev_operations); - ubi->major = MAJOR(dev); - dbg_msg("%s major is %u", ubi->ubi_name, ubi->major); + dbg_gen("%s major is %u", ubi->ubi_name, MAJOR(dev)); ubi->cdev.owner = THIS_MODULE; - dev = MKDEV(ubi->major, 0); err = cdev_add(&ubi->cdev, dev, 1); if (err) { - ubi_err("cannot add character device %s", ubi->ubi_name); + ubi_err("cannot add character device"); goto out_unreg; } err = ubi_sysfs_init(ubi); if (err) - goto out_cdev; + goto out_sysfs; for (i = 0; i < ubi->vtbl_slots; i++) if (ubi->volumes[i]) { - err = ubi_add_volume(ubi, i); - if (err) + err = ubi_add_volume(ubi, ubi->volumes[i]); + if (err) { + ubi_err("cannot add volume %d", i); goto out_volumes; + } } return 0; out_volumes: kill_volumes(ubi); +out_sysfs: ubi_sysfs_close(ubi); -out_cdev: cdev_del(&ubi->cdev); out_unreg: - unregister_chrdev_region(MKDEV(ubi->major, 0), - ubi->vtbl_slots + 1); + unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); + ubi_err("cannot initialize UBI %s, error %d", ubi->ubi_name, err); return err; } /** * uif_close - close user interfaces for an UBI device. * @ubi: UBI device description object + * + * Note, since this function un-registers UBI volume device objects (@vol->dev), + * the memory allocated for the volumes is freed as well (in the release + * function). */ static void uif_close(struct ubi_device *ubi) { kill_volumes(ubi); ubi_sysfs_close(ubi); cdev_del(&ubi->cdev); - unregister_chrdev_region(MKDEV(ubi->major, 0), ubi->vtbl_slots + 1); + unregister_chrdev_region(ubi->cdev.dev, ubi->vtbl_slots + 1); +} + +/** + * free_internal_volumes - free internal volumes.
+ * @ubi: UBI device description object + */ +static void free_internal_volumes(struct ubi_device *ubi) +{ + int i; + + for (i = ubi->vtbl_slots; + i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { + kfree(ubi->volumes[i]->eba_tbl); + kfree(ubi->volumes[i]); + } } /** @@ -370,6 +515,7 @@ out_wl: ubi_wl_close(ubi); out_vtbl: + free_internal_volumes(ubi); vfree(ubi->vtbl); out_si: ubi_scan_destroy_si(si); @@ -377,16 +523,16 @@ } /** - * io_init - initialize I/O unit for a given UBI device. + * io_init - initialize I/O sub-system for a given UBI device. * @ubi: UBI device description object * * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are * assumed: * o EC header is always at offset zero - this cannot be changed; * o VID header starts just after the EC header at the closest address - * aligned to @io->@hdrs_min_io_size; + * aligned to @io->hdrs_min_io_size; * o data starts just after the VID header at the closest address aligned to - * @io->@min_io_size + * @io->min_io_size * * This function returns zero in case of success and a negative error code in * case of failure. @@ -407,6 +553,9 @@ return -EINVAL; } + if (ubi->vid_hdr_offset < 0) + return -EINVAL; + /* * Note, in this implementation we support MTD devices with 0x7FFFFFFF * physical eraseblocks maximum. @@ -422,9 +571,14 @@ ubi->min_io_size = ubi->mtd->writesize; ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft; - /* Make sure minimal I/O unit is power of 2 */ + /* + * Make sure minimal I/O unit is power of 2. Note, there is no + * fundamental reason for this assumption. It is just an optimization + * which allows us to avoid costly division operations. + */ if (!is_power_of_2(ubi->min_io_size)) { - ubi_err("bad min. I/O unit"); + ubi_err("min. 
I/O unit (%d) is not power of 2", + ubi->min_io_size); return -EINVAL; } @@ -453,10 +607,8 @@ } /* Similar for the data offset */ - if (ubi->leb_start == 0) { - ubi->leb_start = ubi->vid_hdr_offset + ubi->vid_hdr_alsize; - ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); - } + ubi->leb_start = ubi->vid_hdr_offset + UBI_EC_HDR_SIZE; + ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size); dbg_msg("vid_hdr_offset %d", ubi->vid_hdr_offset); dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset); @@ -474,7 +626,7 @@ if (ubi->vid_hdr_offset < UBI_EC_HDR_SIZE || ubi->leb_start < ubi->vid_hdr_offset + UBI_VID_HDR_SIZE || ubi->leb_start > ubi->peb_size - UBI_VID_HDR_SIZE || - ubi->leb_start % ubi->min_io_size) { + ubi->leb_start & (ubi->min_io_size - 1)) { ubi_err("bad VID header (%d) or data offsets (%d)", ubi->vid_hdr_offset, ubi->leb_start); return -EINVAL; @@ -499,8 +651,16 @@ ubi->ro_mode = 1; } - dbg_msg("leb_size %d", ubi->leb_size); - dbg_msg("ro_mode %d", ubi->ro_mode); + ubi_msg("physical eraseblock size: %d bytes (%d KiB)", + ubi->peb_size, ubi->peb_size >> 10); + ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size); + ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size); + if (ubi->hdrs_min_io_size != ubi->min_io_size) + ubi_msg("sub-page size: %d", + ubi->hdrs_min_io_size); + ubi_msg("VID header offset: %d (aligned %d)", + ubi->vid_hdr_offset, ubi->vid_hdr_aloffset); + ubi_msg("data offset: %d", ubi->leb_start); /* * Note, ideally, we have to initialize ubi->bad_peb_count here. But @@ -514,89 +674,162 @@ } /** - * attach_mtd_dev - attach an MTD device. - * @mtd_dev: MTD device name or number string - * @vid_hdr_offset: VID header offset - * @data_offset: data offset + * autoresize - re-size the volume which has the "auto-resize" flag set. + * @ubi: UBI device description object + * @vol_id: ID of the volume to re-size * - * This function attaches an MTD device to UBI. 
It first treats @mtd_dev as the - * MTD device name, and tries to open it by this name. If it is unable to open, - * it tries to convert @mtd_dev to an integer and open the MTD device by its - * number. Returns zero in case of success and a negative error code in case of - * failure. + * This function re-sizes the volume marked by the @UBI_VTBL_AUTORESIZE_FLG in + * the volume table to the largest possible size. See comments in ubi-header.h + * for more description of the flag. Returns zero in case of success and a + * negative error code in case of failure. */ -static int attach_mtd_dev(const char *mtd_dev, int vid_hdr_offset, - int data_offset) +static int autoresize(struct ubi_device *ubi, int vol_id) { - struct ubi_device *ubi; - struct mtd_info *mtd; - int i, err; + struct ubi_volume_desc desc; + struct ubi_volume *vol = ubi->volumes[vol_id]; + int err, old_reserved_pebs = vol->reserved_pebs; - mtd = get_mtd_device_nm(mtd_dev); - if (IS_ERR(mtd)) { - int mtd_num; - char *endp; + /* + * Clear the auto-resize flag in the volume in-memory copy of the + * volume table, and 'ubi_resize_volume()' will propagate this change + * to the flash. + */ + ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG; - if (PTR_ERR(mtd) != -ENODEV) - return PTR_ERR(mtd); + if (ubi->avail_pebs == 0) { + struct ubi_vtbl_record vtbl_rec; /* - * Probably this is not MTD device name but MTD device number - - * check this out. + * No available PEBs to re-size the volume, clear the flag on + * flash and exit. 
*/ - mtd_num = simple_strtoul(mtd_dev, &endp, 0); - if (*endp != '\0' || mtd_dev == endp) { - ubi_err("incorrect MTD device: \"%s\"", mtd_dev); - return -ENODEV; + memcpy(&vtbl_rec, &ubi->vtbl[vol_id], + sizeof(struct ubi_vtbl_record)); + err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + if (err) + ubi_err("cannot clean auto-resize flag for volume %d", + vol_id); + } else { + desc.vol = vol; + err = ubi_resize_volume(&desc, + old_reserved_pebs + ubi->avail_pebs); + if (err) + ubi_err("cannot auto-resize volume %d", vol_id); + } + + if (err) + return err; + + ubi_msg("volume %d (\"%s\") re-sized from %d to %d LEBs", vol_id, + vol->name, old_reserved_pebs, vol->reserved_pebs); + return 0; +} + +/** + * ubi_attach_mtd_dev - attach an MTD device. + * @mtd: MTD device description object + * @ubi_num: number to assign to the new UBI device + * @vid_hdr_offset: VID header offset + * + * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number + * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in + * which case this function finds a vacant device number and assigns it + * automatically. Returns the new UBI device number in case of success and a + * negative error code in case of failure. + * + * Note, the invocations of this function has to be serialized by the + * @ubi_devices_mutex. + */ +int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset) +{ + struct ubi_device *ubi; + int i, err, do_free = 1; + + /* + * Check if we already have the same MTD device attached. + * + * Note, this function assumes that UBI devices creations and deletions + * are serialized, so it does not take the &ubi_devices_lock. 
+ */ + for (i = 0; i < UBI_MAX_DEVICES; i++) { + ubi = ubi_devices[i]; + if (ubi && mtd->index == ubi->mtd->index) { + dbg_err("mtd%d is already attached to ubi%d", + mtd->index, i); + return -EEXIST; } + } - mtd = get_mtd_device(NULL, mtd_num); - if (IS_ERR(mtd)) - return PTR_ERR(mtd); + /* + * Make sure this MTD device is not emulated on top of an UBI volume + * already. Well, generally this recursion works fine, but there are + * different problems like the UBI module takes a reference to itself + * by attaching (and thus, opening) the emulated MTD device. This + * results in inability to unload the module. And in general it makes + * no sense to attach emulated MTD devices, so we prohibit this. + */ + if (mtd->type == MTD_UBIVOLUME) { + ubi_err("refuse attaching mtd%d - it is already emulated on " + "top of UBI", mtd->index); + return -EINVAL; } - /* Check if we already have the same MTD device attached */ - for (i = 0; i < ubi_devices_cnt; i++) - if (ubi_devices[i]->mtd->index == mtd->index) { - ubi_err("mtd%d is already attached to ubi%d", - mtd->index, i); - err = -EINVAL; - goto out_mtd; + if (ubi_num == UBI_DEV_NUM_AUTO) { + /* Search for an empty slot in the @ubi_devices array */ + for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++) + if (!ubi_devices[ubi_num]) + break; + if (ubi_num == UBI_MAX_DEVICES) { + dbg_err("only %d UBI devices may be created", + UBI_MAX_DEVICES); + return -ENFILE; } + } else { + if (ubi_num >= UBI_MAX_DEVICES) + return -EINVAL; - ubi = ubi_devices[ubi_devices_cnt] = kzalloc(sizeof(struct ubi_device), - GFP_KERNEL); - if (!ubi) { - err = -ENOMEM; - goto out_mtd; + /* Make sure ubi_num is not busy */ + if (ubi_devices[ubi_num]) { + dbg_err("ubi%d already exists", ubi_num); + return -EEXIST; + } } - ubi->ubi_num = ubi_devices_cnt; + ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL); + if (!ubi) + return -ENOMEM; + ubi->mtd = mtd; + ubi->ubi_num = ubi_num; + ubi->vid_hdr_offset = vid_hdr_offset; + ubi->autoresize_vol_id = -1; 
- dbg_msg("attaching mtd%d to ubi%d: VID header offset %d data offset %d", - ubi->mtd->index, ubi_devices_cnt, vid_hdr_offset, data_offset); + mutex_init(&ubi->buf_mutex); + mutex_init(&ubi->ckvol_mutex); + mutex_init(&ubi->mult_mutex); + mutex_init(&ubi->volumes_mutex); + spin_lock_init(&ubi->volumes_lock); + + ubi_msg("attaching mtd%d to ubi%d", mtd->index, ubi_num); - ubi->vid_hdr_offset = vid_hdr_offset; - ubi->leb_start = data_offset; err = io_init(ubi); if (err) goto out_free; - mutex_init(&ubi->buf_mutex); + err = -ENOMEM; ubi->peb_buf1 = vmalloc(ubi->peb_size); if (!ubi->peb_buf1) goto out_free; ubi->peb_buf2 = vmalloc(ubi->peb_size); if (!ubi->peb_buf2) - goto out_free; + goto out_free; #ifdef CONFIG_MTD_UBI_DEBUG mutex_init(&ubi->dbg_buf_mutex); ubi->dbg_peb_buf = vmalloc(ubi->peb_size); if (!ubi->dbg_peb_buf) - goto out_free; + goto out_free; #endif err = attach_by_scanning(ubi); @@ -605,22 +838,29 @@ goto out_free; } + if (ubi->autoresize_vol_id != -1) { + err = autoresize(ubi, ubi->autoresize_vol_id); + if (err) + goto out_detach; + } + err = uif_init(ubi); if (err) - goto out_detach; + goto out_nofree; - ubi_msg("attached mtd%d to ubi%d", ubi->mtd->index, ubi_devices_cnt); - ubi_msg("MTD device name: \"%s\"", ubi->mtd->name); + ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name); + if (IS_ERR(ubi->bgt_thread)) { + err = PTR_ERR(ubi->bgt_thread); + ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name, + err); + goto out_uif; + } + + ubi_msg("attached mtd%d to ubi%d", mtd->index, ubi_num); + ubi_msg("MTD device name: \"%s\"", mtd->name); ubi_msg("MTD device size: %llu MiB", ubi->flash_size >> 20); - ubi_msg("physical eraseblock size: %d bytes (%d KiB)", - ubi->peb_size, ubi->peb_size >> 10); - ubi_msg("logical eraseblock size: %d bytes", ubi->leb_size); ubi_msg("number of good PEBs: %d", ubi->good_peb_count); ubi_msg("number of bad PEBs: %d", ubi->bad_peb_count); - ubi_msg("smallest flash I/O unit: %d", ubi->min_io_size); - ubi_msg("VID 
header offset: %d (aligned %d)", - ubi->vid_hdr_offset, ubi->vid_hdr_aloffset); - ubi_msg("data offset: %d", ubi->leb_start); ubi_msg("max. allowed volumes: %d", ubi->vtbl_slots); ubi_msg("wear-leveling threshold: %d", CONFIG_MTD_UBI_WL_THRESHOLD); ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT); @@ -632,18 +872,22 @@ ubi->beb_rsvd_pebs); ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec); - /* Enable the background thread */ - if (!DBG_DISABLE_BGT) { + if (!DBG_DISABLE_BGT) ubi->thread_enabled = 1; - wake_up_process(ubi->bgt_thread); - } + wake_up_process(ubi->bgt_thread); - ubi_devices_cnt += 1; - return 0; + ubi_devices[ubi_num] = ubi; + return ubi_num; +out_uif: + uif_close(ubi); +out_nofree: + do_free = 0; out_detach: - ubi_eba_close(ubi); ubi_wl_close(ubi); + if (do_free) + free_user_volumes(ubi); + free_internal_volumes(ubi); vfree(ubi->vtbl); out_free: vfree(ubi->peb_buf1); @@ -652,24 +896,67 @@ vfree(ubi->dbg_peb_buf); #endif kfree(ubi); -out_mtd: - put_mtd_device(mtd); - ubi_devices[ubi_devices_cnt] = NULL; return err; } /** - * detach_mtd_dev - detach an MTD device. - * @ubi: UBI device description object + * ubi_detach_mtd_dev - detach an MTD device. + * @ubi_num: UBI device number to detach from + * @anyway: detach MTD even if device reference count is not zero + * + * This function destroys an UBI device number @ubi_num and detaches the + * underlying MTD device. Returns zero in case of success and %-EBUSY if the + * UBI device is busy and cannot be destroyed, and %-EINVAL if it does not + * exist. + * + * Note, the invocations of this function has to be serialized by the + * @ubi_devices_mutex. 
*/ -static void detach_mtd_dev(struct ubi_device *ubi) +int ubi_detach_mtd_dev(int ubi_num, int anyway) { - int ubi_num = ubi->ubi_num, mtd_num = ubi->mtd->index; + struct ubi_device *ubi; + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) + return -EINVAL; + + spin_lock(&ubi_devices_lock); + ubi = ubi_devices[ubi_num]; + if (!ubi) { + spin_unlock(&ubi_devices_lock); + return -EINVAL; + } + + if (ubi->ref_count) { + if (!anyway) { + spin_unlock(&ubi_devices_lock); + return -EBUSY; + } + /* This may only happen if there is a bug */ + ubi_err("%s reference count %d, destroy anyway", + ubi->ubi_name, ubi->ref_count); + } + ubi_devices[ubi_num] = NULL; + spin_unlock(&ubi_devices_lock); + + ubi_assert(ubi_num == ubi->ubi_num); dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num); + + /* + * Before freeing anything, we have to stop the background thread to + * prevent it from doing anything on this device while we are freeing. + */ + if (ubi->bgt_thread) + kthread_stop(ubi->bgt_thread); + + /* + * Get a reference to the device in order to prevent 'dev_release()' + * from freeing @ubi object. + */ + get_device(&ubi->dev); + uif_close(ubi); - ubi_eba_close(ubi); ubi_wl_close(ubi); + free_internal_volumes(ubi); vfree(ubi->vtbl); put_mtd_device(ubi->mtd); vfree(ubi->peb_buf1); @@ -677,11 +964,37 @@ #ifdef CONFIG_MTD_UBI_DEBUG vfree(ubi->dbg_peb_buf); #endif - kfree(ubi_devices[ubi_num]); - ubi_devices[ubi_num] = NULL; - ubi_devices_cnt -= 1; - ubi_assert(ubi_devices_cnt >= 0); - ubi_msg("mtd%d is detached from ubi%d", mtd_num, ubi_num); + ubi_msg("mtd%d is detached from ubi%d", ubi->mtd->index, ubi->ubi_num); + put_device(&ubi->dev); + return 0; +} + +/** + * open_mtd_device - open an MTD device by its name or number. + * @mtd_dev: name or number of the device + * + * This function tries to open an MTD device described by @mtd_dev string, + * which is first treated as an ASCII number, and if it is not true, it is + * treated as MTD device name.
Returns MTD device description object in case of + * success and a negative error code in case of failure. + */ +static struct mtd_info * __init open_mtd_device(const char *mtd_dev) +{ + struct mtd_info *mtd; + int mtd_num; + char *endp; + + mtd_num = simple_strtoul(mtd_dev, &endp, 0); + if (*endp != '\0' || mtd_dev == endp) { + /* + * This does not look like an ASCII integer, probably this is + * MTD device name. + */ + mtd = get_mtd_device_nm(mtd_dev); + } else + mtd = get_mtd_device(NULL, mtd_num); + + return mtd; } static int __init ubi_init(void) @@ -693,55 +1006,101 @@ BUILD_BUG_ON(sizeof(struct ubi_vid_hdr) != 64); if (mtd_devs > UBI_MAX_DEVICES) { - printk("UBI error: too many MTD devices, maximum is %d\n", - UBI_MAX_DEVICES); + ubi_err("too many MTD devices, maximum is %d", UBI_MAX_DEVICES); return -EINVAL; } + /* Create base sysfs directory and sysfs files */ ubi_class = class_create(THIS_MODULE, UBI_NAME_STR); - if (IS_ERR(ubi_class)) - return PTR_ERR(ubi_class); + if (IS_ERR(ubi_class)) { + err = PTR_ERR(ubi_class); + ubi_err("cannot create UBI class"); + goto out; + } err = class_create_file(ubi_class, &ubi_version); - if (err) + if (err) { + ubi_err("cannot create sysfs file"); goto out_class; + } + + err = misc_register(&ubi_ctrl_cdev); + if (err) { + ubi_err("cannot register device"); + goto out_version; + } + + ubi_wl_entry_slab = kmem_cache_create("ubi_wl_entry_slab", + sizeof(struct ubi_wl_entry), + 0, 0, NULL); + if (!ubi_wl_entry_slab) + goto out_dev_unreg; /* Attach MTD devices */ for (i = 0; i < mtd_devs; i++) { struct mtd_dev_param *p = &mtd_dev_param[i]; + struct mtd_info *mtd; cond_resched(); - err = attach_mtd_dev(p->name, p->vid_hdr_offs, p->data_offs); - if (err) + + mtd = open_mtd_device(p->name); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); goto out_detach; + } + + mutex_lock(&ubi_devices_mutex); + err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO, + p->vid_hdr_offs); + mutex_unlock(&ubi_devices_mutex); + if (err < 0) { + 
put_mtd_device(mtd); + ubi_err("cannot attach mtd%d", mtd->index); + goto out_detach; + } } return 0; out_detach: for (k = 0; k < i; k++) - detach_mtd_dev(ubi_devices[k]); + if (ubi_devices[k]) { + mutex_lock(&ubi_devices_mutex); + ubi_detach_mtd_dev(ubi_devices[k]->ubi_num, 1); + mutex_unlock(&ubi_devices_mutex); + } + kmem_cache_destroy(ubi_wl_entry_slab); +out_dev_unreg: + misc_deregister(&ubi_ctrl_cdev); +out_version: class_remove_file(ubi_class, &ubi_version); out_class: class_destroy(ubi_class); +out: + ubi_err("UBI error: cannot initialize UBI, error %d", err); return err; } module_init(ubi_init); static void __exit ubi_exit(void) { - int i, n = ubi_devices_cnt; + int i; - for (i = 0; i < n; i++) - detach_mtd_dev(ubi_devices[i]); + for (i = 0; i < UBI_MAX_DEVICES; i++) + if (ubi_devices[i]) { + mutex_lock(&ubi_devices_mutex); + ubi_detach_mtd_dev(ubi_devices[i]->ubi_num, 1); + mutex_unlock(&ubi_devices_mutex); + } + kmem_cache_destroy(ubi_wl_entry_slab); + misc_deregister(&ubi_ctrl_cdev); class_remove_file(ubi_class, &ubi_version); class_destroy(ubi_class); } module_exit(ubi_exit); /** - * bytes_str_to_int - convert a string representing number of bytes to an - * integer. + * bytes_str_to_int - convert a number of bytes string into an integer. 
* @str: the string to convert * * This function returns positive resulting integer in case of success and a @@ -754,7 +1113,8 @@ result = simple_strtoul(str, &endp, 0); if (str == endp || result < 0) { - printk("UBI error: incorrect bytes count: \"%s\"\n", str); + printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", + str); return -EINVAL; } @@ -764,15 +1124,14 @@ case 'M': result *= 1024; case 'K': - case 'k': result *= 1024; - if (endp[1] == 'i' && (endp[2] == '\0' || - endp[2] == 'B' || endp[2] == 'b')) + if (endp[1] == 'i' && endp[2] == 'B') endp += 2; case '\0': break; default: - printk("UBI error: incorrect bytes count: \"%s\"\n", str); + printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n", + str); return -EINVAL; } @@ -793,23 +1152,27 @@ struct mtd_dev_param *p; char buf[MTD_PARAM_LEN_MAX]; char *pbuf = &buf[0]; - char *tokens[3] = {NULL, NULL, NULL}; + char *tokens[2] = {NULL, NULL}; + + if (!val) + return -EINVAL; if (mtd_devs == UBI_MAX_DEVICES) { - printk("UBI error: too many parameters, max. is %d\n", + printk(KERN_ERR "UBI error: too many parameters, max. is %d\n", UBI_MAX_DEVICES); return -EINVAL; } len = strnlen(val, MTD_PARAM_LEN_MAX); if (len == MTD_PARAM_LEN_MAX) { - printk("UBI error: parameter \"%s\" is too long, max. is %d\n", - val, MTD_PARAM_LEN_MAX); + printk(KERN_ERR "UBI error: parameter \"%s\" is too long, " + "max. 
is %d\n", val, MTD_PARAM_LEN_MAX); return -EINVAL; } if (len == 0) { - printk("UBI warning: empty 'mtd=' parameter - ignored\n"); + printk(KERN_WARNING "UBI warning: empty 'mtd=' parameter - " + "ignored\n"); return 0; } @@ -819,11 +1182,12 @@ if (buf[len - 1] == '\n') buf[len - 1] = '\0'; - for (i = 0; i < 3; i++) + for (i = 0; i < 2; i++) tokens[i] = strsep(&pbuf, ","); if (pbuf) { - printk("UBI error: too many arguments at \"%s\"\n", val); + printk(KERN_ERR "UBI error: too many arguments at \"%s\"\n", + val); return -EINVAL; } @@ -832,13 +1196,9 @@ if (tokens[1]) p->vid_hdr_offs = bytes_str_to_int(tokens[1]); - if (tokens[2]) - p->data_offs = bytes_str_to_int(tokens[2]); if (p->vid_hdr_offs < 0) return p->vid_hdr_offs; - if (p->data_offs < 0) - return p->data_offs; mtd_devs += 1; return 0; @@ -846,16 +1206,15 @@ module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000); MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: " - "mtd=[,,]. " + "mtd=[,].\n" "Multiple \"mtd\" parameters may be specified.\n" - "MTD devices may be specified by their number or name. 
" - "Optional \"vid_hdr_offs\" and \"data_offs\" parameters " - "specify UBI VID header position and data starting " - "position to be used by UBI.\n" - "Example: mtd=content,1984,2048 mtd=4 - attach MTD device" - "with name content using VID header offset 1984 and data " - "start 2048, and MTD device number 4 using default " - "offsets"); + "MTD devices may be specified by their number or name.\n" + "Optional \"vid_hdr_offs\" parameter specifies UBI VID " + "header position and data starting position to be used " + "by UBI.\n" + "Example: mtd=content,1984 mtd=4 - attach MTD device" + "with name \"content\" using VID header offset 1984, and " + "MTD device number 4 with default VID header offset."); MODULE_VERSION(__stringify(UBI_VERSION)); MODULE_DESCRIPTION("UBI - Unsorted Block Images"); diff -Nurd linux-2.6.24/drivers/mtd/ubi/cdev.c ubifs-v2.6.24/drivers/mtd/ubi/cdev.c --- linux-2.6.24/drivers/mtd/ubi/cdev.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/cdev.c 2009-04-07 17:14:47.000000000 +0200 @@ -28,45 +28,22 @@ * * Major and minor numbers are assigned dynamically to both UBI and volume * character devices. + * + * Well, there is the third kind of character devices - the UBI control + * character device, which allows to manipulate by UBI devices - create and + * delete them. In other words, it is used for attaching and detaching MTD + * devices. */ #include #include #include #include +#include +#include #include -#include -#include #include "ubi.h" -/* - * Maximum sequence numbers of UBI and volume character device IOCTLs (direct - * logical eraseblock erase is a debug-only feature). - */ -#define UBI_CDEV_IOC_MAX_SEQ 2 -#ifndef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO -#define VOL_CDEV_IOC_MAX_SEQ 1 -#else -#define VOL_CDEV_IOC_MAX_SEQ 2 -#endif - -/** - * major_to_device - get UBI device object by character device major number. - * @major: major number - * - * This function returns a pointer to the UBI device object. 
- */ -static struct ubi_device *major_to_device(int major) -{ - int i; - - for (i = 0; i < ubi_devices_cnt; i++) - if (ubi_devices[i] && ubi_devices[i]->major == major) - return ubi_devices[i]; - BUG(); - return NULL; -} - /** * get_exclusive - get exclusive access to an UBI volume. * @desc: volume descriptor @@ -124,18 +101,20 @@ static int vol_cdev_open(struct inode *inode, struct file *file) { struct ubi_volume_desc *desc; - const struct ubi_device *ubi = major_to_device(imajor(inode)); - int vol_id = iminor(inode) - 1; - int mode; + int vol_id = iminor(inode) - 1, mode, ubi_num; + + ubi_num = ubi_major2num(imajor(inode)); + if (ubi_num < 0) + return ubi_num; if (file->f_mode & FMODE_WRITE) mode = UBI_READWRITE; else mode = UBI_READONLY; - dbg_msg("open volume %d, mode %d", vol_id, mode); + dbg_gen("open volume %d, mode %d", vol_id, mode); - desc = ubi_open_volume(ubi->ubi_num, vol_id, mode); + desc = ubi_open_volume(ubi_num, vol_id, mode); if (IS_ERR(desc)) return PTR_ERR(desc); @@ -148,13 +127,20 @@ struct ubi_volume_desc *desc = file->private_data; struct ubi_volume *vol = desc->vol; - dbg_msg("release volume %d, mode %d", vol->vol_id, desc->mode); + dbg_gen("release volume %d, mode %d", vol->vol_id, desc->mode); if (vol->updating) { ubi_warn("update of volume %d not finished, volume is damaged", vol->vol_id); + ubi_assert(!vol->changing_leb); vol->updating = 0; vfree(vol->upd_buf); + } else if (vol->changing_leb) { + dbg_gen("only %lld of %lld bytes received for atomic LEB change" + " for volume %d:%d, cancel", vol->upd_received, + vol->upd_bytes, vol->ubi->ubi_num, vol->vol_id); + vol->changing_leb = 0; + vfree(vol->upd_buf); } ubi_close_volume(desc); @@ -192,26 +178,35 @@ return -EINVAL; } - dbg_msg("seek volume %d, offset %lld, origin %d, new offset %lld", + dbg_gen("seek volume %d, offset %lld, origin %d, new offset %lld", vol->vol_id, offset, origin, new_offset); file->f_pos = new_offset; return new_offset; } +static int vol_cdev_fsync(struct file 
*file, struct dentry *dentry, + int datasync) +{ + struct ubi_volume_desc *desc = file->private_data; + struct ubi_device *ubi = desc->vol->ubi; + + return ubi_sync(ubi->ubi_num); +} + + static ssize_t vol_cdev_read(struct file *file, __user char *buf, size_t count, loff_t *offp) { struct ubi_volume_desc *desc = file->private_data; struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; - int err, lnum, off, len, vol_id = desc->vol->vol_id, tbuf_size; + int err, lnum, off, len, tbuf_size; size_t count_save = count; void *tbuf; - uint64_t tmp; - dbg_msg("read %zd bytes from offset %lld of volume %d", - count, *offp, vol_id); + dbg_gen("read %zd bytes from offset %lld of volume %d", + count, *offp, vol->vol_id); if (vol->updating) { dbg_err("updating"); @@ -225,7 +220,7 @@ return 0; if (vol->corrupted) - dbg_msg("read from corrupted volume %d", vol_id); + dbg_gen("read from corrupted volume %d", vol->vol_id); if (*offp + count > vol->used_bytes) count_save = count = vol->used_bytes - *offp; @@ -238,10 +233,7 @@ return -ENOMEM; len = count > tbuf_size ? tbuf_size : count; - - tmp = *offp; - off = do_div(tmp, vol->usable_leb_size); - lnum = tmp; + lnum = div_u64_rem(*offp, vol->usable_leb_size, &off); do { cond_resched(); @@ -249,7 +241,7 @@ if (off + len >= vol->usable_leb_size) len = vol->usable_leb_size - off; - err = ubi_eba_read_leb(ubi, vol_id, lnum, tbuf, off, len, 0); + err = ubi_eba_read_leb(ubi, vol, lnum, tbuf, off, len, 0); if (err) break; @@ -276,12 +268,9 @@ return err ? err : count_save - count; } -#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO - /* * This function allows to directly write to dynamic UBI volumes, without - * issuing the volume update operation. Available only as a debugging feature. - * Very useful for testing UBI. + * issuing the volume update operation. 
*/ static ssize_t vol_cdev_direct_write(struct file *file, const char __user *buf, size_t count, loff_t *offp) @@ -289,22 +278,21 @@ struct ubi_volume_desc *desc = file->private_data; struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; - int lnum, off, len, tbuf_size, vol_id = vol->vol_id, err = 0; + int lnum, off, len, tbuf_size, err = 0; size_t count_save = count; char *tbuf; - uint64_t tmp; - dbg_msg("requested: write %zd bytes to offset %lld of volume %u", - count, *offp, desc->vol->vol_id); + if (!vol->direct_writes) + return -EPERM; + + dbg_gen("requested: write %zd bytes to offset %lld of volume %u", + count, *offp, vol->vol_id); if (vol->vol_type == UBI_STATIC_VOLUME) return -EROFS; - tmp = *offp; - off = do_div(tmp, vol->usable_leb_size); - lnum = tmp; - - if (off % ubi->min_io_size) { + lnum = div_u64_rem(*offp, vol->usable_leb_size, &off); + if (off & (ubi->min_io_size - 1)) { dbg_err("unaligned position"); return -EINVAL; } @@ -313,7 +301,7 @@ count_save = count = vol->used_bytes - *offp; /* We can write only in fractions of the minimum I/O unit */ - if (count % ubi->min_io_size) { + if (count & (ubi->min_io_size - 1)) { dbg_err("unaligned write length"); return -EINVAL; } @@ -339,7 +327,7 @@ break; } - err = ubi_eba_write_leb(ubi, vol_id, lnum, tbuf, off, len, + err = ubi_eba_write_leb(ubi, vol, lnum, tbuf, off, len, UBI_UNKNOWN); if (err) break; @@ -360,10 +348,6 @@ return err ? 
err : count_save - count; } -#else -#define vol_cdev_direct_write(file, buf, count, offp) -EPERM -#endif /* CONFIG_MTD_UBI_DEBUG_USERSPACE_IO */ - static ssize_t vol_cdev_write(struct file *file, const char __user *buf, size_t count, loff_t *offp) { @@ -372,22 +356,32 @@ struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; - if (!vol->updating) + if (!vol->updating && !vol->changing_leb) return vol_cdev_direct_write(file, buf, count, offp); - err = ubi_more_update_data(ubi, vol->vol_id, buf, count); + if (vol->updating) + err = ubi_more_update_data(ubi, vol, buf, count); + else + err = ubi_more_leb_change_data(ubi, vol, buf, count); + if (err < 0) { - ubi_err("cannot write %zd bytes of update data", count); + ubi_err("cannot accept more %zd bytes of data, error %d", + count, err); return err; } if (err) { /* - * Update is finished, @err contains number of actually written - * bytes now. + * The operation is finished, @err contains number of actually + * written bytes. */ count = err; + if (vol->changing_leb) { + revoke_exclusive(desc, UBI_READWRITE); + return count; + } + err = ubi_check_volume(ubi, vol->vol_id); if (err < 0) return err; @@ -402,12 +396,11 @@ revoke_exclusive(desc, UBI_READWRITE); } - *offp += count; return count; } -static int vol_cdev_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static long vol_cdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { int err = 0; struct ubi_volume_desc *desc = file->private_data; @@ -437,7 +430,8 @@ break; } - rsvd_bytes = vol->reserved_pebs * (ubi->leb_size-vol->data_pad); + rsvd_bytes = (long long)vol->reserved_pebs * + ubi->leb_size-vol->data_pad; if (bytes < 0 || bytes > rsvd_bytes) { err = -EINVAL; break; @@ -447,15 +441,49 @@ if (err < 0) break; - err = ubi_start_update(ubi, vol->vol_id, bytes); + err = ubi_start_update(ubi, vol, bytes); if (bytes == 0) revoke_exclusive(desc, UBI_READWRITE); + break; + } - file->f_pos = 0; + /* Atomic 
logical eraseblock change command */ + case UBI_IOCEBCH: + { + struct ubi_leb_change_req req; + + err = copy_from_user(&req, argp, + sizeof(struct ubi_leb_change_req)); + if (err) { + err = -EFAULT; + break; + } + + if (desc->mode == UBI_READONLY || + vol->vol_type == UBI_STATIC_VOLUME) { + err = -EROFS; + break; + } + + /* Validate the request */ + err = -EINVAL; + if (req.lnum < 0 || req.lnum >= vol->reserved_pebs || + req.bytes < 0 || req.lnum >= vol->usable_leb_size) + break; + if (req.dtype != UBI_LONGTERM && req.dtype != UBI_SHORTTERM && + req.dtype != UBI_UNKNOWN) + break; + + err = get_exclusive(desc); + if (err < 0) + break; + + err = ubi_start_leb_change(ubi, vol, &req); + if (req.bytes == 0) + revoke_exclusive(desc, UBI_READWRITE); break; } -#ifdef CONFIG_MTD_UBI_DEBUG_USERSPACE_IO /* Logical eraseblock erasure command */ case UBI_IOCEBER: { @@ -467,7 +495,8 @@ break; } - if (desc->mode == UBI_READONLY) { + if (desc->mode == UBI_READONLY || + vol->vol_type == UBI_STATIC_VOLUME) { err = -EROFS; break; } @@ -477,26 +506,85 @@ break; } - if (vol->vol_type != UBI_DYNAMIC_VOLUME) { - err = -EROFS; + dbg_gen("erase LEB %d:%d", vol->vol_id, lnum); + err = ubi_eba_unmap_leb(ubi, vol, lnum); + if (err) + break; + + err = ubi_wl_flush(ubi); + break; + } + + /* Logical eraseblock map command */ + case UBI_IOCEBMAP: + { + struct ubi_map_req req; + + err = copy_from_user(&req, argp, sizeof(struct ubi_map_req)); + if (err) { + err = -EFAULT; break; } + err = ubi_leb_map(desc, req.lnum, req.dtype); + break; + } - dbg_msg("erase LEB %d:%d", vol->vol_id, lnum); - err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum); - if (err) + /* Logical eraseblock un-map command */ + case UBI_IOCEBUNMAP: + { + int32_t lnum; + + err = get_user(lnum, (__user int32_t *)argp); + if (err) { + err = -EFAULT; break; + } + err = ubi_leb_unmap(desc, lnum); + break; + } - err = ubi_wl_flush(ubi); + /* Check if logical eraseblock is mapped command */ + case UBI_IOCEBISMAP: + { + int32_t lnum; + + err 
= get_user(lnum, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + err = ubi_is_mapped(desc, lnum); + break; + } + + /* Set volume property command*/ + case UBI_IOCSETPROP: + { + struct ubi_set_prop_req req; + + err = copy_from_user(&req, argp, + sizeof(struct ubi_set_prop_req)); + if (err) { + err = -EFAULT; + break; + } + switch (req.property) { + case UBI_PROP_DIRECT_WRITE: + mutex_lock(&ubi->volumes_mutex); + desc->vol->direct_writes = !!req.value; + mutex_unlock(&ubi->volumes_mutex); + break; + default: + err = -EINVAL; + break; + } break; } -#endif default: err = -ENOTTY; break; } - return err; } @@ -533,7 +621,7 @@ if (req->alignment > ubi->leb_size) goto bad; - n = req->alignment % ubi->min_io_size; + n = req->alignment & (ubi->min_io_size - 1); if (req->alignment != 1 && n) goto bad; @@ -542,6 +630,10 @@ goto bad; } + n = strnlen(req->name, req->name_len + 1); + if (n != req->name_len) + goto bad; + return 0; bad: @@ -569,8 +661,169 @@ return 0; } -static int ubi_cdev_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +/** + * rename_volumes - rename UBI volumes. + * @ubi: UBI device description object + * @req: volumes re-name request + * + * This is a helper function for the volume re-name IOCTL which validates the + * the request, opens the volume and calls corresponding volumes management + * function. Returns zero in case of success and a negative error code in case + * of failure. 
+ */ +static int rename_volumes(struct ubi_device *ubi, + struct ubi_rnvol_req *req) +{ + int i, n, err; + struct list_head rename_list; + struct ubi_rename_entry *re, *re1; + + if (req->count < 0 || req->count > UBI_MAX_RNVOL) + return -EINVAL; + + if (req->count == 0) + return 0; + + /* Validate volume IDs and names in the request */ + for (i = 0; i < req->count; i++) { + if (req->ents[i].vol_id < 0 || + req->ents[i].vol_id >= ubi->vtbl_slots) + return -EINVAL; + if (req->ents[i].name_len < 0) + return -EINVAL; + if (req->ents[i].name_len > UBI_VOL_NAME_MAX) + return -ENAMETOOLONG; + req->ents[i].name[req->ents[i].name_len] = '\0'; + n = strlen(req->ents[i].name); + if (n != req->ents[i].name_len) + err = -EINVAL; + } + + /* Make sure volume IDs and names are unique */ + for (i = 0; i < req->count - 1; i++) { + for (n = i + 1; n < req->count; n++) { + if (req->ents[i].vol_id == req->ents[n].vol_id) { + dbg_err("duplicated volume id %d", + req->ents[i].vol_id); + return -EINVAL; + } + if (!strcmp(req->ents[i].name, req->ents[n].name)) { + dbg_err("duplicated volume name \"%s\"", + req->ents[i].name); + return -EINVAL; + } + } + } + + /* Create the re-name list */ + INIT_LIST_HEAD(&rename_list); + for (i = 0; i < req->count; i++) { + int vol_id = req->ents[i].vol_id; + int name_len = req->ents[i].name_len; + const char *name = req->ents[i].name; + + re = kzalloc(sizeof(struct ubi_rename_entry), GFP_KERNEL); + if (!re) { + err = -ENOMEM; + goto out_free; + } + + re->desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE); + if (IS_ERR(re->desc)) { + err = PTR_ERR(re->desc); + dbg_err("cannot open volume %d, error %d", vol_id, err); + kfree(re); + goto out_free; + } + + /* Skip this re-naming if the name does not really change */ + if (re->desc->vol->name_len == name_len && + !memcmp(re->desc->vol->name, name, name_len)) { + ubi_close_volume(re->desc); + kfree(re); + continue; + } + + re->new_name_len = name_len; + memcpy(re->new_name, name, name_len); + 
list_add_tail(&re->list, &rename_list); + dbg_msg("will rename volume %d from \"%s\" to \"%s\"", + vol_id, re->desc->vol->name, name); + } + + if (list_empty(&rename_list)) + return 0; + + /* Find out the volumes which have to be removed */ + list_for_each_entry(re, &rename_list, list) { + struct ubi_volume_desc *desc; + int no_remove_needed = 0; + + /* + * Volume @re->vol_id is going to be re-named to + * @re->new_name, while its current name is @name. If a volume + * with name @re->new_name currently exists, it has to be + * removed, unless it is also re-named in the request (@req). + */ + list_for_each_entry(re1, &rename_list, list) { + if (re->new_name_len == re1->desc->vol->name_len && + !memcmp(re->new_name, re1->desc->vol->name, + re1->desc->vol->name_len)) { + no_remove_needed = 1; + break; + } + } + + if (no_remove_needed) + continue; + + /* + * It seems we need to remove volume with name @re->new_name, + * if it exists. + */ + desc = ubi_open_volume_nm(ubi->ubi_num, re->new_name, + UBI_EXCLUSIVE); + if (IS_ERR(desc)) { + err = PTR_ERR(desc); + if (err == -ENODEV) + /* Re-naming into a non-existing volume name */ + continue; + + /* The volume exists but busy, or an error occurred */ + dbg_err("cannot open volume \"%s\", error %d", + re->new_name, err); + goto out_free; + } + + re = kzalloc(sizeof(struct ubi_rename_entry), GFP_KERNEL); + if (!re) { + err = -ENOMEM; + ubi_close_volume(desc); + goto out_free; + } + + re->remove = 1; + re->desc = desc; + list_add(&re->list, &rename_list); + dbg_msg("will remove volume %d, name \"%s\"", + re->desc->vol->vol_id, re->desc->vol->name); + } + + mutex_lock(&ubi->volumes_mutex); + err = ubi_rename_volumes(ubi, &rename_list); + mutex_unlock(&ubi->volumes_mutex); + +out_free: + list_for_each_entry_safe(re, re1, &rename_list, list) { + ubi_close_volume(re->desc); + list_del(&re->list); + kfree(re); + } + return err; +} + +static long ubi_cdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { int err = 0; 
struct ubi_device *ubi; @@ -580,9 +833,9 @@ if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - ubi = major_to_device(imajor(inode)); - if (IS_ERR(ubi)) - return PTR_ERR(ubi); + ubi = ubi_get_by_major(imajor(file->f_mapping->host)); + if (!ubi) + return -ENODEV; switch (cmd) { /* Create volume command */ @@ -590,21 +843,21 @@ { struct ubi_mkvol_req req; - dbg_msg("create volume"); - err = copy_from_user(&req, argp, - sizeof(struct ubi_mkvol_req)); + dbg_gen("create volume"); + err = copy_from_user(&req, argp, sizeof(struct ubi_mkvol_req)); if (err) { err = -EFAULT; break; } + req.name[req.name_len] = '\0'; err = verify_mkvol_req(ubi, &req); if (err) break; - req.name[req.name_len] = '\0'; - + mutex_lock(&ubi->volumes_mutex); err = ubi_create_volume(ubi, &req); + mutex_unlock(&ubi->volumes_mutex); if (err) break; @@ -620,7 +873,7 @@ { int vol_id; - dbg_msg("remove volume"); + dbg_gen("remove volume"); err = get_user(vol_id, (__user int32_t *)argp); if (err) { err = -EFAULT; @@ -633,10 +886,16 @@ break; } - err = ubi_remove_volume(desc); - if (err) - ubi_close_volume(desc); + mutex_lock(&ubi->volumes_mutex); + err = ubi_remove_volume(desc, 0); + mutex_unlock(&ubi->volumes_mutex); + /* + * The volume is deleted (unless an error occurred), and the + * 'struct ubi_volume' object will be freed when + * 'ubi_close_volume()' will call 'put_device()'. 
+ */ + ubi_close_volume(desc); break; } @@ -644,12 +903,10 @@ case UBI_IOCRSVOL: { int pebs; - uint64_t tmp; struct ubi_rsvol_req req; - dbg_msg("re-size volume"); - err = copy_from_user(&req, argp, - sizeof(struct ubi_rsvol_req)); + dbg_gen("re-size volume"); + err = copy_from_user(&req, argp, sizeof(struct ubi_rsvol_req)); if (err) { err = -EFAULT; break; @@ -665,37 +922,182 @@ break; } - tmp = req.bytes; - pebs = !!do_div(tmp, desc->vol->usable_leb_size); - pebs += tmp; + pebs = div_u64(req.bytes + desc->vol->usable_leb_size - 1, + desc->vol->usable_leb_size); + mutex_lock(&ubi->volumes_mutex); err = ubi_resize_volume(desc, pebs); + mutex_unlock(&ubi->volumes_mutex); ubi_close_volume(desc); break; } + /* Re-name volumes command */ + case UBI_IOCRNVOL: + { + struct ubi_rnvol_req *req; + + dbg_msg("re-name volumes"); + req = kmalloc(sizeof(struct ubi_rnvol_req), GFP_KERNEL); + if (!req) { + err = -ENOMEM; + break; + }; + + err = copy_from_user(req, argp, sizeof(struct ubi_rnvol_req)); + if (err) { + err = -EFAULT; + kfree(req); + break; + } + + mutex_lock(&ubi->mult_mutex); + err = rename_volumes(ubi, req); + mutex_unlock(&ubi->mult_mutex); + kfree(req); + break; + } + default: err = -ENOTTY; break; } + ubi_put_device(ubi); return err; } +static long ctrl_cdev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + void __user *argp = (void __user *)arg; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + switch (cmd) { + /* Attach an MTD device command */ + case UBI_IOCATT: + { + struct ubi_attach_req req; + struct mtd_info *mtd; + + dbg_gen("attach MTD device"); + err = copy_from_user(&req, argp, sizeof(struct ubi_attach_req)); + if (err) { + err = -EFAULT; + break; + } + + if (req.mtd_num < 0 || + (req.ubi_num < 0 && req.ubi_num != UBI_DEV_NUM_AUTO)) { + err = -EINVAL; + break; + } + + mtd = get_mtd_device(NULL, req.mtd_num); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + break; + } + + /* + * Note, further request verification is 
done by + * 'ubi_attach_mtd_dev()'. + */ + mutex_lock(&ubi_devices_mutex); + err = ubi_attach_mtd_dev(mtd, req.ubi_num, req.vid_hdr_offset); + mutex_unlock(&ubi_devices_mutex); + if (err < 0) + put_mtd_device(mtd); + else + /* @err contains UBI device number */ + err = put_user(err, (__user int32_t *)argp); + + break; + } + + /* Detach an MTD device command */ + case UBI_IOCDET: + { + int ubi_num; + + dbg_gen("dettach MTD device"); + err = get_user(ubi_num, (__user int32_t *)argp); + if (err) { + err = -EFAULT; + break; + } + + mutex_lock(&ubi_devices_mutex); + err = ubi_detach_mtd_dev(ubi_num, 0); + mutex_unlock(&ubi_devices_mutex); + break; + } + + default: + err = -ENOTTY; + break; + } + + return err; +} + +#ifdef CONFIG_COMPAT +static long vol_cdev_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned long translated_arg = (unsigned long)compat_ptr(arg); + + return vol_cdev_ioctl(file, cmd, translated_arg); +} + +static long ubi_cdev_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned long translated_arg = (unsigned long)compat_ptr(arg); + + return ubi_cdev_ioctl(file, cmd, translated_arg); +} + +static long ctrl_cdev_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + unsigned long translated_arg = (unsigned long)compat_ptr(arg); + + return ctrl_cdev_ioctl(file, cmd, translated_arg); +} +#else +#define vol_cdev_compat_ioctl NULL +#define ubi_cdev_compat_ioctl NULL +#define ctrl_cdev_compat_ioctl NULL +#endif + +/* UBI volume character device operations */ +const struct file_operations ubi_vol_cdev_operations = { + .owner = THIS_MODULE, + .open = vol_cdev_open, + .release = vol_cdev_release, + .llseek = vol_cdev_llseek, + .read = vol_cdev_read, + .write = vol_cdev_write, + .fsync = vol_cdev_fsync, + .unlocked_ioctl = vol_cdev_ioctl, + .compat_ioctl = vol_cdev_compat_ioctl, +}; + /* UBI character device operations */ -struct file_operations ubi_cdev_operations = { - .owner = 
THIS_MODULE, - .ioctl = ubi_cdev_ioctl, - .llseek = no_llseek, +const struct file_operations ubi_cdev_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = ubi_cdev_ioctl, + .compat_ioctl = ubi_cdev_compat_ioctl, }; -/* UBI volume character device operations */ -struct file_operations ubi_vol_cdev_operations = { - .owner = THIS_MODULE, - .open = vol_cdev_open, - .release = vol_cdev_release, - .llseek = vol_cdev_llseek, - .read = vol_cdev_read, - .write = vol_cdev_write, - .ioctl = vol_cdev_ioctl, +/* UBI control character device operations */ +const struct file_operations ubi_ctrl_cdev_operations = { + .owner = THIS_MODULE, + .unlocked_ioctl = ctrl_cdev_ioctl, + .compat_ioctl = ctrl_cdev_compat_ioctl, }; diff -Nurd linux-2.6.24/drivers/mtd/ubi/debug.c ubifs-v2.6.24/drivers/mtd/ubi/debug.c --- linux-2.6.24/drivers/mtd/ubi/debug.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/debug.c 2009-04-07 17:14:47.000000000 +0200 @@ -24,7 +24,7 @@ * changes. 
*/ -#ifdef CONFIG_MTD_UBI_DEBUG_MSG +#ifdef CONFIG_MTD_UBI_DEBUG #include "ubi.h" @@ -34,14 +34,19 @@ */ void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr) { - dbg_msg("erase counter header dump:"); - dbg_msg("magic %#08x", be32_to_cpu(ec_hdr->magic)); - dbg_msg("version %d", (int)ec_hdr->version); - dbg_msg("ec %llu", (long long)be64_to_cpu(ec_hdr->ec)); - dbg_msg("vid_hdr_offset %d", be32_to_cpu(ec_hdr->vid_hdr_offset)); - dbg_msg("data_offset %d", be32_to_cpu(ec_hdr->data_offset)); - dbg_msg("hdr_crc %#08x", be32_to_cpu(ec_hdr->hdr_crc)); - dbg_msg("erase counter header hexdump:"); + printk(KERN_DEBUG "Erase counter header dump:\n"); + printk(KERN_DEBUG "\tmagic %#08x\n", + be32_to_cpu(ec_hdr->magic)); + printk(KERN_DEBUG "\tversion %d\n", (int)ec_hdr->version); + printk(KERN_DEBUG "\tec %llu\n", + (long long)be64_to_cpu(ec_hdr->ec)); + printk(KERN_DEBUG "\tvid_hdr_offset %d\n", + be32_to_cpu(ec_hdr->vid_hdr_offset)); + printk(KERN_DEBUG "\tdata_offset %d\n", + be32_to_cpu(ec_hdr->data_offset)); + printk(KERN_DEBUG "\thdr_crc %#08x\n", + be32_to_cpu(ec_hdr->hdr_crc)); + printk(KERN_DEBUG "erase counter header hexdump:\n"); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, ec_hdr, UBI_EC_HDR_SIZE, 1); } @@ -52,22 +57,23 @@ */ void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr) { - dbg_msg("volume identifier header dump:"); - dbg_msg("magic %08x", be32_to_cpu(vid_hdr->magic)); - dbg_msg("version %d", (int)vid_hdr->version); - dbg_msg("vol_type %d", (int)vid_hdr->vol_type); - dbg_msg("copy_flag %d", (int)vid_hdr->copy_flag); - dbg_msg("compat %d", (int)vid_hdr->compat); - dbg_msg("vol_id %d", be32_to_cpu(vid_hdr->vol_id)); - dbg_msg("lnum %d", be32_to_cpu(vid_hdr->lnum)); - dbg_msg("leb_ver %u", be32_to_cpu(vid_hdr->leb_ver)); - dbg_msg("data_size %d", be32_to_cpu(vid_hdr->data_size)); - dbg_msg("used_ebs %d", be32_to_cpu(vid_hdr->used_ebs)); - dbg_msg("data_pad %d", be32_to_cpu(vid_hdr->data_pad)); - dbg_msg("sqnum %llu", + 
printk(KERN_DEBUG "Volume identifier header dump:\n"); + printk(KERN_DEBUG "\tmagic %08x\n", be32_to_cpu(vid_hdr->magic)); + printk(KERN_DEBUG "\tversion %d\n", (int)vid_hdr->version); + printk(KERN_DEBUG "\tvol_type %d\n", (int)vid_hdr->vol_type); + printk(KERN_DEBUG "\tcopy_flag %d\n", (int)vid_hdr->copy_flag); + printk(KERN_DEBUG "\tcompat %d\n", (int)vid_hdr->compat); + printk(KERN_DEBUG "\tvol_id %d\n", be32_to_cpu(vid_hdr->vol_id)); + printk(KERN_DEBUG "\tlnum %d\n", be32_to_cpu(vid_hdr->lnum)); + printk(KERN_DEBUG "\tdata_size %d\n", be32_to_cpu(vid_hdr->data_size)); + printk(KERN_DEBUG "\tused_ebs %d\n", be32_to_cpu(vid_hdr->used_ebs)); + printk(KERN_DEBUG "\tdata_pad %d\n", be32_to_cpu(vid_hdr->data_pad)); + printk(KERN_DEBUG "\tsqnum %llu\n", (unsigned long long)be64_to_cpu(vid_hdr->sqnum)); - dbg_msg("hdr_crc %08x", be32_to_cpu(vid_hdr->hdr_crc)); - dbg_msg("volume identifier header hexdump:"); + printk(KERN_DEBUG "\thdr_crc %08x\n", be32_to_cpu(vid_hdr->hdr_crc)); + printk(KERN_DEBUG "Volume identifier header hexdump:\n"); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + vid_hdr, UBI_VID_HDR_SIZE, 1); } /** @@ -76,27 +82,27 @@ */ void ubi_dbg_dump_vol_info(const struct ubi_volume *vol) { - dbg_msg("volume information dump:"); - dbg_msg("vol_id %d", vol->vol_id); - dbg_msg("reserved_pebs %d", vol->reserved_pebs); - dbg_msg("alignment %d", vol->alignment); - dbg_msg("data_pad %d", vol->data_pad); - dbg_msg("vol_type %d", vol->vol_type); - dbg_msg("name_len %d", vol->name_len); - dbg_msg("usable_leb_size %d", vol->usable_leb_size); - dbg_msg("used_ebs %d", vol->used_ebs); - dbg_msg("used_bytes %lld", vol->used_bytes); - dbg_msg("last_eb_bytes %d", vol->last_eb_bytes); - dbg_msg("corrupted %d", vol->corrupted); - dbg_msg("upd_marker %d", vol->upd_marker); + printk(KERN_DEBUG "Volume information dump:\n"); + printk(KERN_DEBUG "\tvol_id %d\n", vol->vol_id); + printk(KERN_DEBUG "\treserved_pebs %d\n", vol->reserved_pebs); + printk(KERN_DEBUG 
"\talignment %d\n", vol->alignment); + printk(KERN_DEBUG "\tdata_pad %d\n", vol->data_pad); + printk(KERN_DEBUG "\tvol_type %d\n", vol->vol_type); + printk(KERN_DEBUG "\tname_len %d\n", vol->name_len); + printk(KERN_DEBUG "\tusable_leb_size %d\n", vol->usable_leb_size); + printk(KERN_DEBUG "\tused_ebs %d\n", vol->used_ebs); + printk(KERN_DEBUG "\tused_bytes %lld\n", vol->used_bytes); + printk(KERN_DEBUG "\tlast_eb_bytes %d\n", vol->last_eb_bytes); + printk(KERN_DEBUG "\tcorrupted %d\n", vol->corrupted); + printk(KERN_DEBUG "\tupd_marker %d\n", vol->upd_marker); if (vol->name_len <= UBI_VOL_NAME_MAX && strnlen(vol->name, vol->name_len + 1) == vol->name_len) { - dbg_msg("name %s", vol->name); + printk(KERN_DEBUG "\tname %s\n", vol->name); } else { - dbg_msg("the 1st 5 characters of the name: %c%c%c%c%c", - vol->name[0], vol->name[1], vol->name[2], - vol->name[3], vol->name[4]); + printk(KERN_DEBUG "\t1st 5 characters of name: %c%c%c%c%c\n", + vol->name[0], vol->name[1], vol->name[2], + vol->name[3], vol->name[4]); } } @@ -109,28 +115,29 @@ { int name_len = be16_to_cpu(r->name_len); - dbg_msg("volume table record %d dump:", idx); - dbg_msg("reserved_pebs %d", be32_to_cpu(r->reserved_pebs)); - dbg_msg("alignment %d", be32_to_cpu(r->alignment)); - dbg_msg("data_pad %d", be32_to_cpu(r->data_pad)); - dbg_msg("vol_type %d", (int)r->vol_type); - dbg_msg("upd_marker %d", (int)r->upd_marker); - dbg_msg("name_len %d", name_len); + printk(KERN_DEBUG "Volume table record %d dump:\n", idx); + printk(KERN_DEBUG "\treserved_pebs %d\n", + be32_to_cpu(r->reserved_pebs)); + printk(KERN_DEBUG "\talignment %d\n", be32_to_cpu(r->alignment)); + printk(KERN_DEBUG "\tdata_pad %d\n", be32_to_cpu(r->data_pad)); + printk(KERN_DEBUG "\tvol_type %d\n", (int)r->vol_type); + printk(KERN_DEBUG "\tupd_marker %d\n", (int)r->upd_marker); + printk(KERN_DEBUG "\tname_len %d\n", name_len); if (r->name[0] == '\0') { - dbg_msg("name NULL"); + printk(KERN_DEBUG "\tname NULL\n"); return; } if (name_len <= 
UBI_VOL_NAME_MAX && strnlen(&r->name[0], name_len + 1) == name_len) { - dbg_msg("name %s", &r->name[0]); + printk(KERN_DEBUG "\tname %s\n", &r->name[0]); } else { - dbg_msg("1st 5 characters of the name: %c%c%c%c%c", + printk(KERN_DEBUG "\t1st 5 characters of name: %c%c%c%c%c\n", r->name[0], r->name[1], r->name[2], r->name[3], r->name[4]); } - dbg_msg("crc %#08x", be32_to_cpu(r->crc)); + printk(KERN_DEBUG "\tcrc %#08x\n", be32_to_cpu(r->crc)); } /** @@ -139,15 +146,15 @@ */ void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv) { - dbg_msg("volume scanning information dump:"); - dbg_msg("vol_id %d", sv->vol_id); - dbg_msg("highest_lnum %d", sv->highest_lnum); - dbg_msg("leb_count %d", sv->leb_count); - dbg_msg("compat %d", sv->compat); - dbg_msg("vol_type %d", sv->vol_type); - dbg_msg("used_ebs %d", sv->used_ebs); - dbg_msg("last_data_size %d", sv->last_data_size); - dbg_msg("data_pad %d", sv->data_pad); + printk(KERN_DEBUG "Volume scanning information dump:\n"); + printk(KERN_DEBUG "\tvol_id %d\n", sv->vol_id); + printk(KERN_DEBUG "\thighest_lnum %d\n", sv->highest_lnum); + printk(KERN_DEBUG "\tleb_count %d\n", sv->leb_count); + printk(KERN_DEBUG "\tcompat %d\n", sv->compat); + printk(KERN_DEBUG "\tvol_type %d\n", sv->vol_type); + printk(KERN_DEBUG "\tused_ebs %d\n", sv->used_ebs); + printk(KERN_DEBUG "\tlast_data_size %d\n", sv->last_data_size); + printk(KERN_DEBUG "\tdata_pad %d\n", sv->data_pad); } /** @@ -157,14 +164,13 @@ */ void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type) { - dbg_msg("eraseblock scanning information dump:"); - dbg_msg("ec %d", seb->ec); - dbg_msg("pnum %d", seb->pnum); + printk(KERN_DEBUG "eraseblock scanning information dump:\n"); + printk(KERN_DEBUG "\tec %d\n", seb->ec); + printk(KERN_DEBUG "\tpnum %d\n", seb->pnum); if (type == 0) { - dbg_msg("lnum %d", seb->lnum); - dbg_msg("scrub %d", seb->scrub); - dbg_msg("sqnum %llu", seb->sqnum); - dbg_msg("leb_ver %u", seb->leb_ver); + printk(KERN_DEBUG "\tlnum %d\n", seb->lnum); + 
printk(KERN_DEBUG "\tscrub %d\n", seb->scrub); + printk(KERN_DEBUG "\tsqnum %llu\n", seb->sqnum); } } @@ -176,16 +182,16 @@ { char nm[17]; - dbg_msg("volume creation request dump:"); - dbg_msg("vol_id %d", req->vol_id); - dbg_msg("alignment %d", req->alignment); - dbg_msg("bytes %lld", (long long)req->bytes); - dbg_msg("vol_type %d", req->vol_type); - dbg_msg("name_len %d", req->name_len); + printk(KERN_DEBUG "Volume creation request dump:\n"); + printk(KERN_DEBUG "\tvol_id %d\n", req->vol_id); + printk(KERN_DEBUG "\talignment %d\n", req->alignment); + printk(KERN_DEBUG "\tbytes %lld\n", (long long)req->bytes); + printk(KERN_DEBUG "\tvol_type %d\n", req->vol_type); + printk(KERN_DEBUG "\tname_len %d\n", req->name_len); memcpy(nm, req->name, 16); nm[16] = 0; - dbg_msg("the 1st 16 characters of the name: %s", nm); + printk(KERN_DEBUG "\t1st 16 characters of name: %s\n", nm); } -#endif /* CONFIG_MTD_UBI_DEBUG_MSG */ +#endif /* CONFIG_MTD_UBI_DEBUG */ diff -Nurd linux-2.6.24/drivers/mtd/ubi/debug.h ubifs-v2.6.24/drivers/mtd/ubi/debug.h --- linux-2.6.24/drivers/mtd/ubi/debug.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/debug.h 2009-04-07 17:14:47.000000000 +0200 @@ -24,23 +24,19 @@ #ifdef CONFIG_MTD_UBI_DEBUG #include -#define ubi_assert(expr) BUG_ON(!(expr)) #define dbg_err(fmt, ...) ubi_err(fmt, ##__VA_ARGS__) -#else -#define ubi_assert(expr) ({}) -#define dbg_err(fmt, ...) ({}) -#endif -#ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT -#define DBG_DISABLE_BGT 1 -#else -#define DBG_DISABLE_BGT 0 -#endif +#define ubi_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBI assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + ubi_dbg_dump_stack(); \ + } \ +} while (0) -#ifdef CONFIG_MTD_UBI_DEBUG_MSG -/* Generic debugging message */ -#define dbg_msg(fmt, ...) \ - printk(KERN_DEBUG "UBI DBG: %s: " fmt "\n", __FUNCTION__, ##__VA_ARGS__) +#define dbg_msg(fmt, ...) 
\ + printk(KERN_DEBUG "UBI DBG (pid %d): %s: " fmt "\n", \ + current->pid, __FUNCTION__, ##__VA_ARGS__) #define ubi_dbg_dump_stack() dump_stack() @@ -60,54 +56,47 @@ void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type); void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req); +#ifdef CONFIG_MTD_UBI_DEBUG_MSG +/* General debugging messages */ +#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) #else - -#define dbg_msg(fmt, ...) ({}) -#define ubi_dbg_dump_stack() ({}) -#define ubi_dbg_dump_ec_hdr(ec_hdr) ({}) -#define ubi_dbg_dump_vid_hdr(vid_hdr) ({}) -#define ubi_dbg_dump_vol_info(vol) ({}) -#define ubi_dbg_dump_vtbl_record(r, idx) ({}) -#define ubi_dbg_dump_sv(sv) ({}) -#define ubi_dbg_dump_seb(seb, type) ({}) -#define ubi_dbg_dump_mkvol_req(req) ({}) - -#endif /* CONFIG_MTD_UBI_DEBUG_MSG */ +#define dbg_gen(fmt, ...) ({}) +#endif #ifdef CONFIG_MTD_UBI_DEBUG_MSG_EBA -/* Messages from the eraseblock association unit */ -#define dbg_eba(fmt, ...) \ - printk(KERN_DEBUG "UBI DBG eba: %s: " fmt "\n", __FUNCTION__, \ - ##__VA_ARGS__) +/* Messages from the eraseblock association sub-system */ +#define dbg_eba(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) #else #define dbg_eba(fmt, ...) ({}) #endif #ifdef CONFIG_MTD_UBI_DEBUG_MSG_WL -/* Messages from the wear-leveling unit */ -#define dbg_wl(fmt, ...) \ - printk(KERN_DEBUG "UBI DBG wl: %s: " fmt "\n", __FUNCTION__, \ - ##__VA_ARGS__) +/* Messages from the wear-leveling sub-system */ +#define dbg_wl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) #else #define dbg_wl(fmt, ...) ({}) #endif #ifdef CONFIG_MTD_UBI_DEBUG_MSG_IO -/* Messages from the input/output unit */ -#define dbg_io(fmt, ...) \ - printk(KERN_DEBUG "UBI DBG io: %s: " fmt "\n", __FUNCTION__, \ - ##__VA_ARGS__) +/* Messages from the input/output sub-system */ +#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) #else #define dbg_io(fmt, ...) ({}) #endif #ifdef CONFIG_MTD_UBI_DEBUG_MSG_BLD /* Initialization and build messages */ -#define dbg_bld(fmt, ...) 
\ - printk(KERN_DEBUG "UBI DBG bld: %s: " fmt "\n", __FUNCTION__, \ - ##__VA_ARGS__) +#define dbg_bld(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__) +#define UBI_IO_DEBUG 1 #else #define dbg_bld(fmt, ...) ({}) +#define UBI_IO_DEBUG 0 +#endif + +#ifdef CONFIG_MTD_UBI_DEBUG_DISABLE_BGT +#define DBG_DISABLE_BGT 1 +#else +#define DBG_DISABLE_BGT 0 #endif #ifdef CONFIG_MTD_UBI_DEBUG_EMULATE_BITFLIPS @@ -154,4 +143,50 @@ #define ubi_dbg_is_erase_failure() 0 #endif +#else + +#define ubi_assert(expr) ({}) +#define dbg_err(fmt, ...) ({}) +#define dbg_msg(fmt, ...) ({}) +#define dbg_gen(fmt, ...) ({}) +#define dbg_eba(fmt, ...) ({}) +#define dbg_wl(fmt, ...) ({}) +#define dbg_io(fmt, ...) ({}) +#define dbg_bld(fmt, ...) ({}) +#define ubi_dbg_dump_stack() ({}) +#define ubi_dbg_dump_ec_hdr(ec_hdr) ({}) +#define ubi_dbg_dump_vid_hdr(vid_hdr) ({}) +#define ubi_dbg_dump_vol_info(vol) ({}) +#define ubi_dbg_dump_vtbl_record(r, idx) ({}) +#define ubi_dbg_dump_sv(sv) ({}) +#define ubi_dbg_dump_seb(seb, type) ({}) +#define ubi_dbg_dump_mkvol_req(req) ({}) + +#define UBI_IO_DEBUG 0 +#define DBG_DISABLE_BGT 0 +#define ubi_dbg_is_bitflip() 0 +#define ubi_dbg_is_write_failure() 0 +#define ubi_dbg_is_erase_failure() 0 + +#endif /* !CONFIG_MTD_UBI_DEBUG */ + +/* + * Some compatibility stuff goes here. + */ + +#include + +static inline uint64_t div_u64(uint64_t dividend, uint64_t divisor) +{ + do_div(dividend, divisor); + return dividend; +} + +static inline uint64_t div_u64_rem(uint64_t dividend, uint32_t divisor, + uint32_t *remainder) +{ + *remainder = do_div(dividend, divisor); + return dividend; +} + #endif /* !__UBI_DEBUG_H__ */ diff -Nurd linux-2.6.24/drivers/mtd/ubi/eba.c ubifs-v2.6.24/drivers/mtd/ubi/eba.c --- linux-2.6.24/drivers/mtd/ubi/eba.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/eba.c 2009-04-07 17:14:47.000000000 +0200 @@ -19,20 +19,20 @@ */ /* - * The UBI Eraseblock Association (EBA) unit. + * The UBI Eraseblock Association (EBA) sub-system. 
* - * This unit is responsible for I/O to/from logical eraseblock. + * This sub-system is responsible for I/O to/from logical eraseblock. * * Although in this implementation the EBA table is fully kept and managed in * RAM, which assumes poor scalability, it might be (partially) maintained on * flash in future implementations. * - * The EBA unit implements per-logical eraseblock locking. Before accessing a - * logical eraseblock it is locked for reading or writing. The per-logical - * eraseblock locking is implemented by means of the lock tree. The lock tree - * is an RB-tree which refers all the currently locked logical eraseblocks. The - * lock tree elements are &struct ltree_entry objects. They are indexed by - * (@vol_id, @lnum) pairs. + * The EBA sub-system implements per-logical eraseblock locking. Before + * accessing a logical eraseblock it is locked for reading or writing. The + * per-logical eraseblock locking is implemented by means of the lock tree. The + * lock tree is an RB-tree which refers all the currently locked logical + * eraseblocks. The lock tree elements are &struct ubi_ltree_entry objects. + * They are indexed by (@vol_id, @lnum) pairs. * * EBA also maintains the global sequence counter which is incremented each * time a logical eraseblock is mapped to a physical eraseblock and it is @@ -50,29 +50,6 @@ #define EBA_RESERVED_PEBS 1 /** - * struct ltree_entry - an entry in the lock tree. - * @rb: links RB-tree nodes - * @vol_id: volume ID of the locked logical eraseblock - * @lnum: locked logical eraseblock number - * @users: how many tasks are using this logical eraseblock or wait for it - * @mutex: read/write mutex to implement read/write access serialization to - * the (@vol_id, @lnum) logical eraseblock - * - * When a logical eraseblock is being locked - corresponding &struct ltree_entry - * object is inserted to the lock tree (@ubi->ltree). 
- */ -struct ltree_entry { - struct rb_node rb; - int vol_id; - int lnum; - int users; - struct rw_semaphore mutex; -}; - -/* Slab cache for lock-tree entries */ -static struct kmem_cache *ltree_slab; - -/** * next_sqnum - get next sequence number. * @ubi: UBI device description object * @@ -101,7 +78,7 @@ */ static int ubi_get_compat(const struct ubi_device *ubi, int vol_id) { - if (vol_id == UBI_LAYOUT_VOL_ID) + if (vol_id == UBI_LAYOUT_VOLUME_ID) return UBI_LAYOUT_VOLUME_COMPAT; return 0; } @@ -112,20 +89,20 @@ * @vol_id: volume ID * @lnum: logical eraseblock number * - * This function returns a pointer to the corresponding &struct ltree_entry + * This function returns a pointer to the corresponding &struct ubi_ltree_entry * object if the logical eraseblock is locked and %NULL if it is not. * @ubi->ltree_lock has to be locked. */ -static struct ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id, - int lnum) +static struct ubi_ltree_entry *ltree_lookup(struct ubi_device *ubi, int vol_id, + int lnum) { struct rb_node *p; p = ubi->ltree.rb_node; while (p) { - struct ltree_entry *le; + struct ubi_ltree_entry *le; - le = rb_entry(p, struct ltree_entry, rb); + le = rb_entry(p, struct ubi_ltree_entry, rb); if (vol_id < le->vol_id) p = p->rb_left; @@ -155,15 +132,17 @@ * Returns pointer to the lock tree entry or %-ENOMEM if memory allocation * failed. 
*/ -static struct ltree_entry *ltree_add_entry(struct ubi_device *ubi, int vol_id, - int lnum) +static struct ubi_ltree_entry *ltree_add_entry(struct ubi_device *ubi, + int vol_id, int lnum) { - struct ltree_entry *le, *le1, *le_free; + struct ubi_ltree_entry *le, *le1, *le_free; - le = kmem_cache_alloc(ltree_slab, GFP_NOFS); + le = kmalloc(sizeof(struct ubi_ltree_entry), GFP_NOFS); if (!le) return ERR_PTR(-ENOMEM); + le->users = 0; + init_rwsem(&le->mutex); le->vol_id = vol_id; le->lnum = lnum; @@ -189,7 +168,7 @@ p = &ubi->ltree.rb_node; while (*p) { parent = *p; - le1 = rb_entry(parent, struct ltree_entry, rb); + le1 = rb_entry(parent, struct ubi_ltree_entry, rb); if (vol_id < le1->vol_id) p = &(*p)->rb_left; @@ -210,9 +189,7 @@ le->users += 1; spin_unlock(&ubi->ltree_lock); - if (le_free) - kmem_cache_free(ltree_slab, le_free); - + kfree(le_free); return le; } @@ -227,7 +204,7 @@ */ static int leb_read_lock(struct ubi_device *ubi, int vol_id, int lnum) { - struct ltree_entry *le; + struct ubi_ltree_entry *le; le = ltree_add_entry(ubi, vol_id, lnum); if (IS_ERR(le)) @@ -244,22 +221,18 @@ */ static void leb_read_unlock(struct ubi_device *ubi, int vol_id, int lnum) { - int free = 0; - struct ltree_entry *le; + struct ubi_ltree_entry *le; spin_lock(&ubi->ltree_lock); le = ltree_lookup(ubi, vol_id, lnum); le->users -= 1; ubi_assert(le->users >= 0); + up_read(&le->mutex); if (le->users == 0) { rb_erase(&le->rb, &ubi->ltree); - free = 1; + kfree(le); } spin_unlock(&ubi->ltree_lock); - - up_read(&le->mutex); - if (free) - kmem_cache_free(ltree_slab, le); } /** @@ -273,7 +246,7 @@ */ static int leb_write_lock(struct ubi_device *ubi, int vol_id, int lnum) { - struct ltree_entry *le; + struct ubi_ltree_entry *le; le = ltree_add_entry(ubi, vol_id, lnum); if (IS_ERR(le)) @@ -283,6 +256,40 @@ } /** + * leb_write_trylock - try to lock logical eraseblock for writing.
+ * @ubi: UBI device description object + * @vol_id: volume ID + * @lnum: logical eraseblock number + * + * This function locks a logical eraseblock for writing if there is no + * contention and does nothing if there is contention. Returns %0 in case of + * success, %1 in case of contention, and a negative error code in case of + * failure. + */ +static int leb_write_trylock(struct ubi_device *ubi, int vol_id, int lnum) +{ + struct ubi_ltree_entry *le; + + le = ltree_add_entry(ubi, vol_id, lnum); + if (IS_ERR(le)) + return PTR_ERR(le); + if (down_write_trylock(&le->mutex)) + return 0; + + /* Contention, cancel */ + spin_lock(&ubi->ltree_lock); + le->users -= 1; + ubi_assert(le->users >= 0); + if (le->users == 0) { + rb_erase(&le->rb, &ubi->ltree); + kfree(le); + } + spin_unlock(&ubi->ltree_lock); + + return 1; +} + +/** * leb_write_unlock - unlock logical eraseblock. * @ubi: UBI device description object * @vol_id: volume ID @@ -290,39 +297,34 @@ */ static void leb_write_unlock(struct ubi_device *ubi, int vol_id, int lnum) { - int free; - struct ltree_entry *le; + struct ubi_ltree_entry *le; spin_lock(&ubi->ltree_lock); le = ltree_lookup(ubi, vol_id, lnum); le->users -= 1; ubi_assert(le->users >= 0); + up_write(&le->mutex); if (le->users == 0) { rb_erase(&le->rb, &ubi->ltree); - free = 1; - } else - free = 0; + kfree(le); + } spin_unlock(&ubi->ltree_lock); - - up_write(&le->mutex); - if (free) - kmem_cache_free(ltree_slab, le); } /** * ubi_eba_unmap_leb - un-map logical eraseblock. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * * This function un-maps logical eraseblock @lnum and schedules corresponding * physical eraseblock for erasure. Returns zero in case of success and a * negative error code in case of failure.
*/ -int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum) +int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum) { - int idx = vol_id2idx(ubi, vol_id), err, pnum; - struct ubi_volume *vol = ubi->volumes[idx]; + int err, pnum, vol_id = vol->vol_id; if (ubi->ro_mode) return -EROFS; @@ -349,7 +351,7 @@ /** * ubi_eba_read_leb - read data. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * @buf: buffer to store the read data * @offset: offset from where to read @@ -365,12 +367,11 @@ * returned for any volume type if an ECC error was detected by the MTD device * driver. Other negative error cored may be returned in case of other errors. */ -int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, - int offset, int len, int check) +int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, + void *buf, int offset, int len, int check) { - int err, pnum, scrub = 0, idx = vol_id2idx(ubi, vol_id); + int err, pnum, scrub = 0, vol_id = vol->vol_id; struct ubi_vid_hdr *vid_hdr; - struct ubi_volume *vol = ubi->volumes[idx]; uint32_t uninitialized_var(crc); err = leb_read_lock(ubi, vol_id, lnum); @@ -500,16 +501,12 @@ struct ubi_vid_hdr *vid_hdr; vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS); - if (!vid_hdr) { + if (!vid_hdr) return -ENOMEM; - } - - mutex_lock(&ubi->buf_mutex); retry: new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN); if (new_pnum < 0) { - mutex_unlock(&ubi->buf_mutex); ubi_free_vid_hdr(ubi, vid_hdr); return new_pnum; } @@ -529,20 +526,23 @@ goto write_error; data_size = offset + len; + mutex_lock(&ubi->buf_mutex); memset(ubi->peb_buf1 + offset, 0xFF, len); /* Read everything before the area where the write failure happened */ if (offset > 0) { err = ubi_io_read_data(ubi, ubi->peb_buf1, pnum, 0, offset); if (err && err != UBI_IO_BITFLIPS) - goto out_put; + goto out_unlock; } memcpy(ubi->peb_buf1 + offset, 
buf, len); err = ubi_io_write_data(ubi, ubi->peb_buf1, new_pnum, 0, data_size); - if (err) + if (err) { + mutex_unlock(&ubi->buf_mutex); goto write_error; + } mutex_unlock(&ubi->buf_mutex); ubi_free_vid_hdr(ubi, vid_hdr); @@ -553,8 +553,9 @@ ubi_msg("data was successfully recovered"); return 0; -out_put: +out_unlock: mutex_unlock(&ubi->buf_mutex); +out_put: ubi_wl_put_peb(ubi, new_pnum, 1); ubi_free_vid_hdr(ubi, vid_hdr); return err; @@ -567,7 +568,6 @@ ubi_warn("failed to write to PEB %d", new_pnum); ubi_wl_put_peb(ubi, new_pnum, 1); if (++tries > UBI_IO_RETRIES) { - mutex_unlock(&ubi->buf_mutex); ubi_free_vid_hdr(ubi, vid_hdr); return err; } @@ -578,7 +578,7 @@ /** * ubi_eba_write_leb - write data to dynamic volume. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * @buf: the data to write * @offset: offset within the logical eraseblock where to write @@ -586,15 +586,14 @@ * @dtype: data type * * This function writes data to logical eraseblock @lnum of a dynamic volume - * @vol_id. Returns zero in case of success and a negative error code in case + * @vol. Returns zero in case of success and a negative error code in case * of failure. In case of error, it is possible that something was still * written to the flash media, but may be some garbage. 
*/ -int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, +int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, const void *buf, int offset, int len, int dtype) { - int idx = vol_id2idx(ubi, vol_id), err, pnum, tries = 0; - struct ubi_volume *vol = ubi->volumes[idx]; + int err, pnum, tries = 0, vol_id = vol->vol_id; struct ubi_vid_hdr *vid_hdr; if (ubi->ro_mode) @@ -613,7 +612,8 @@ if (err) { ubi_warn("failed to write data to PEB %d", pnum); if (err == -EIO && ubi->bad_allowed) - err = recover_peb(ubi, pnum, vol_id, lnum, buf, offset, len); + err = recover_peb(ubi, pnum, vol_id, lnum, buf, + offset, len); if (err) ubi_ro_mode(ubi); } @@ -656,11 +656,14 @@ goto write_error; } - err = ubi_io_write_data(ubi, buf, pnum, offset, len); - if (err) { - ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, " - "PEB %d", len, offset, vol_id, lnum, pnum); - goto write_error; + if (len) { + err = ubi_io_write_data(ubi, buf, pnum, offset, len); + if (err) { + ubi_warn("failed to write %d bytes at offset %d of " + "LEB %d:%d, PEB %d", len, offset, vol_id, + lnum, pnum); + goto write_error; + } } vol->eba_tbl[lnum] = pnum; @@ -698,7 +701,7 @@ /** * ubi_eba_write_leb_st - write data to static volume. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * @buf: data to write * @len: how many bytes to write @@ -706,7 +709,7 @@ * @used_ebs: how many logical eraseblocks will this volume contain * * This function writes data to logical eraseblock @lnum of static volume - * @vol_id. The @used_ebs argument should contain total number of logical + * @vol. The @used_ebs argument should contain total number of logical * eraseblock in this static volume. * * When writing to the last logical eraseblock, the @len argument doesn't have @@ -718,12 +721,11 @@ * volumes. This function returns zero in case of success and a negative error * code in case of failure. 
*/ -int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, - const void *buf, int len, int dtype, int used_ebs) +int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum, const void *buf, int len, int dtype, + int used_ebs) { - int err, pnum, tries = 0, data_size = len; - int idx = vol_id2idx(ubi, vol_id); - struct ubi_volume *vol = ubi->volumes[idx]; + int err, pnum, tries = 0, data_size = len, vol_id = vol->vol_id; struct ubi_vid_hdr *vid_hdr; uint32_t crc; @@ -734,7 +736,7 @@ /* If this is the last LEB @len may be unaligned */ len = ALIGN(data_size, ubi->min_io_size); else - ubi_assert(len % ubi->min_io_size == 0); + ubi_assert(!(len & (ubi->min_io_size - 1))); vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS); if (!vid_hdr) @@ -819,7 +821,7 @@ /* * ubi_eba_atomic_leb_change - change logical eraseblock atomically. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * @buf: data to write * @len: how many bytes to write @@ -834,17 +836,27 @@ * UBI reserves one LEB for the "atomic LEB change" operation, so only one * LEB change may be done at a time. This is ensured by @ubi->alc_mutex. */ -int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, - const void *buf, int len, int dtype) +int ubi_eba_atomic_leb_change(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum, const void *buf, int len, int dtype) { - int err, pnum, tries = 0, idx = vol_id2idx(ubi, vol_id); - struct ubi_volume *vol = ubi->volumes[idx]; + int err, pnum, tries = 0, vol_id = vol->vol_id; struct ubi_vid_hdr *vid_hdr; uint32_t crc; if (ubi->ro_mode) return -EROFS; + if (len == 0) { + /* + * Special case when data length is zero. In this case the LEB + * has to be unmapped and mapped somewhere else. 
+ */ + err = ubi_eba_unmap_leb(ubi, vol, lnum); + if (err) + return err; + return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0, dtype); + } + vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS); if (!vid_hdr) return -ENOMEM; @@ -891,7 +903,7 @@ } if (vol->eba_tbl[lnum] >= 0) { - err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 1); + err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 0); if (err) goto out_leb_unlock; } @@ -928,20 +940,6 @@ } /** - * ltree_entry_ctor - lock tree entries slab cache constructor. - * @obj: the lock-tree entry to construct - * @cache: the lock tree entry slab cache - * @flags: constructor flags - */ -static void ltree_entry_ctor(struct kmem_cache *cache, void *obj) -{ - struct ltree_entry *le = obj; - - le->users = 0; - init_rwsem(&le->mutex); -} - -/** * ubi_eba_copy_leb - copy logical eraseblock. * @ubi: UBI device description object * @from: physical eraseblock number from where to copy @@ -950,14 +948,20 @@ * * This function copies logical eraseblock from physical eraseblock @from to * physical eraseblock @to. The @vid_hdr buffer may be changed by this - * function. Returns zero in case of success, %UBI_IO_BITFLIPS if the operation - * was canceled because bit-flips were detected at the target PEB, and a - * negative error code in case of failure. + * function. Returns: + * o %0 in case of success; + * o %1 if the operation was canceled because the volume is being deleted + * or because the PEB was put meanwhile; + * o %2 if the operation was canceled because there was a write error to the + * target PEB; + * o %-EAGAIN if the operation was canceled because a bit-flip was detected + * in the target PEB; + * o a negative error code in case of failure. 
*/ int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, struct ubi_vid_hdr *vid_hdr) { - int err, vol_id, lnum, data_size, aldata_size, pnum, idx; + int err, vol_id, lnum, data_size, aldata_size, idx; struct ubi_volume *vol; uint32_t crc; @@ -973,51 +977,67 @@ data_size = aldata_size = ubi->leb_size - be32_to_cpu(vid_hdr->data_pad); - /* - * We do not want anybody to write to this logical eraseblock while we - * are moving it, so we lock it. - */ - err = leb_write_lock(ubi, vol_id, lnum); - if (err) - return err; - - mutex_lock(&ubi->buf_mutex); - - /* - * But the logical eraseblock might have been put by this time. - * Cancel if it is true. - */ idx = vol_id2idx(ubi, vol_id); - + spin_lock(&ubi->volumes_lock); /* - * We may race with volume deletion/re-size, so we have to hold - * @ubi->volumes_lock. + * Note, we may race with volume deletion, which means that the volume + * this logical eraseblock belongs to might be being deleted. Since the + * volume deletion un-maps all the volume's logical eraseblocks, it will + * be locked in 'ubi_wl_put_peb()' and wait for the WL worker to finish. */ - spin_lock(&ubi->volumes_lock); vol = ubi->volumes[idx]; if (!vol) { - dbg_eba("volume %d was removed meanwhile", vol_id); + /* No need to do further work, cancel */ + dbg_eba("volume %d is being removed, cancel", vol_id); spin_unlock(&ubi->volumes_lock); - goto out_unlock; + return 1; } + spin_unlock(&ubi->volumes_lock); - pnum = vol->eba_tbl[lnum]; - if (pnum != from) { - dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to " - "PEB %d, cancel", vol_id, lnum, from, pnum); - spin_unlock(&ubi->volumes_lock); - goto out_unlock; + /* + * We do not want anybody to write to this logical eraseblock while we + * are moving it, so lock it. + * + * Note, we are using non-waiting locking here, because we cannot sleep + * on the LEB, since it may cause deadlocks. Indeed, imagine a task is + * unmapping the LEB which is mapped to the PEB we are going to move + * (@from). 
This task locks the LEB and goes to sleep in the + * 'ubi_wl_put_peb()' function on the @ubi->move_mutex. In turn, we are + * holding @ubi->move_mutex and go to sleep on the LEB lock. So, if the + * LEB is already locked, we just do not move it and return %1. + */ + err = leb_write_trylock(ubi, vol_id, lnum); + if (err) { + dbg_eba("contention on LEB %d:%d, cancel", vol_id, lnum); + return err; } - spin_unlock(&ubi->volumes_lock); - /* OK, now the LEB is locked and we can safely start moving it */ + /* + * The LEB might have been put meanwhile, and the task which put it is + * probably waiting on @ubi->move_mutex. No need to continue the work, + * cancel it. + */ + if (vol->eba_tbl[lnum] != from) { + dbg_eba("LEB %d:%d is no longer mapped to PEB %d, mapped to " + "PEB %d, cancel", vol_id, lnum, from, + vol->eba_tbl[lnum]); + err = 1; + goto out_unlock_leb; + } + /* + * OK, now the LEB is locked and we can safely start moving it. Since + * this function utilizes the @ubi->peb_buf1 buffer which is shared + * with some other functions, lock the buffer by taking the + * @ubi->buf_mutex.
+ */ + mutex_lock(&ubi->buf_mutex); dbg_eba("read %d bytes of data", aldata_size); err = ubi_io_read_data(ubi, ubi->peb_buf1, from, 0, aldata_size); if (err && err != UBI_IO_BITFLIPS) { ubi_warn("error %d while reading data from PEB %d", err, from); - goto out_unlock; + goto out_unlock_buf; } /* @@ -1052,8 +1072,11 @@ vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi)); err = ubi_io_write_vid_hdr(ubi, to, vid_hdr); - if (err) - goto out_unlock; + if (err) { + if (err == -EIO) + err = 2; + goto out_unlock_buf; + } cond_resched(); @@ -1062,13 +1085,18 @@ if (err) { if (err != UBI_IO_BITFLIPS) ubi_warn("cannot read VID header back from PEB %d", to); - goto out_unlock; + else + err = -EAGAIN; + goto out_unlock_buf; } if (data_size > 0) { err = ubi_io_write_data(ubi, ubi->peb_buf1, to, 0, aldata_size); - if (err) - goto out_unlock; + if (err) { + if (err == -EIO) + err = 2; + goto out_unlock_buf; + } cond_resched(); @@ -1082,29 +1110,33 @@ if (err != UBI_IO_BITFLIPS) ubi_warn("cannot read data back from PEB %d", to); - goto out_unlock; + else + err = -EAGAIN; + goto out_unlock_buf; } cond_resched(); if (memcmp(ubi->peb_buf1, ubi->peb_buf2, aldata_size)) { - ubi_warn("read data back from PEB %d - it is different", - to); - goto out_unlock; + ubi_warn("read data back from PEB %d and it is " + "different", to); + err = -EINVAL; + goto out_unlock_buf; } } ubi_assert(vol->eba_tbl[lnum] == from); vol->eba_tbl[lnum] = to; -out_unlock: +out_unlock_buf: mutex_unlock(&ubi->buf_mutex); +out_unlock_leb: leb_write_unlock(ubi, vol_id, lnum); return err; } /** - * ubi_eba_init_scan - initialize the EBA unit using scanning information. + * ubi_eba_init_scan - initialize the EBA sub-system using scanning information. 
* @ubi: UBI device description object * @si: scanning information * @@ -1119,20 +1151,12 @@ struct ubi_scan_leb *seb; struct rb_node *rb; - dbg_eba("initialize EBA unit"); + dbg_eba("initialize EBA sub-system"); spin_lock_init(&ubi->ltree_lock); mutex_init(&ubi->alc_mutex); ubi->ltree = RB_ROOT; - if (ubi_devices_cnt == 0) { - ltree_slab = kmem_cache_create("ubi_ltree_slab", - sizeof(struct ltree_entry), 0, - 0, &ltree_entry_ctor); - if (!ltree_slab) - return -ENOMEM; - } - ubi->global_sqnum = si->max_sqnum + 1; num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; @@ -1168,6 +1192,15 @@ } } + if (ubi->avail_pebs < EBA_RESERVED_PEBS) { + ubi_err("no enough physical eraseblocks (%d, need %d)", + ubi->avail_pebs, EBA_RESERVED_PEBS); + err = -ENOSPC; + goto out_free; + } + ubi->avail_pebs -= EBA_RESERVED_PEBS; + ubi->rsvd_pebs += EBA_RESERVED_PEBS; + if (ubi->bad_allowed) { ubi_calculate_reserved(ubi); @@ -1184,16 +1217,7 @@ ubi->rsvd_pebs += ubi->beb_rsvd_pebs; } - if (ubi->avail_pebs < EBA_RESERVED_PEBS) { - ubi_err("no enough physical eraseblocks (%d, need %d)", - ubi->avail_pebs, EBA_RESERVED_PEBS); - err = -ENOSPC; - goto out_free; - } - ubi->avail_pebs -= EBA_RESERVED_PEBS; - ubi->rsvd_pebs += EBA_RESERVED_PEBS; - - dbg_eba("EBA unit is initialized"); + dbg_eba("EBA sub-system is initialized"); return 0; out_free: @@ -1202,26 +1226,5 @@ continue; kfree(ubi->volumes[i]->eba_tbl); } - if (ubi_devices_cnt == 0) - kmem_cache_destroy(ltree_slab); return err; } - -/** - * ubi_eba_close - close EBA unit.
- * @ubi: UBI device description object - */ -void ubi_eba_close(const struct ubi_device *ubi) -{ - int i, num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT; - - dbg_eba("close EBA unit"); - - for (i = 0; i < num_volumes; i++) { - if (!ubi->volumes[i]) - continue; - kfree(ubi->volumes[i]->eba_tbl); - } - if (ubi_devices_cnt == 1) - kmem_cache_destroy(ltree_slab); -} diff -Nurd linux-2.6.24/drivers/mtd/ubi/gluebi.c ubifs-v2.6.24/drivers/mtd/ubi/gluebi.c --- linux-2.6.24/drivers/mtd/ubi/gluebi.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/gluebi.c 2009-04-07 17:14:47.000000000 +0200 @@ -28,7 +28,7 @@ * eraseblock size is equivalent to the logical eraseblock size of the volume. */ -#include +#include #include "ubi.h" /** @@ -109,9 +109,8 @@ int err = 0, lnum, offs, total_read; struct ubi_volume *vol; struct ubi_device *ubi; - uint64_t tmp = from; - dbg_msg("read %zd bytes from offset %lld", len, from); + dbg_gen("read %zd bytes from offset %lld", len, from); if (len < 0 || from < 0 || from + len > mtd->size) return -EINVAL; @@ -119,9 +118,7 @@ vol = container_of(mtd, struct ubi_volume, gluebi_mtd); ubi = vol->ubi; - offs = do_div(tmp, mtd->erasesize); - lnum = tmp; - + lnum = div_u64_rem(from, mtd->erasesize, &offs); total_read = len; while (total_read) { size_t to_read = mtd->erasesize - offs; @@ -129,8 +126,7 @@ if (to_read > total_read) to_read = total_read; - err = ubi_eba_read_leb(ubi, vol->vol_id, lnum, buf, offs, - to_read, 0); + err = ubi_eba_read_leb(ubi, vol, lnum, buf, offs, to_read, 0); if (err) break; @@ -161,9 +157,8 @@ int err = 0, lnum, offs, total_written; struct ubi_volume *vol; struct ubi_device *ubi; - uint64_t tmp = to; - dbg_msg("write %zd bytes to offset %lld", len, to); + dbg_gen("write %zd bytes to offset %lld", len, to); if (len < 0 || to < 0 || len + to > mtd->size) return -EINVAL; @@ -174,8 +169,7 @@ if (ubi->ro_mode) return -EROFS; - offs = do_div(tmp, mtd->erasesize); - lnum = tmp; + lnum = div_u64_rem(to, 
mtd->erasesize, &offs); if (len % mtd->writesize || offs % mtd->writesize) return -EINVAL; @@ -187,8 +181,8 @@ if (to_write > total_written) to_write = total_written; - err = ubi_eba_write_leb(ubi, vol->vol_id, lnum, buf, offs, - to_write, UBI_UNKNOWN); + err = ubi_eba_write_leb(ubi, vol, lnum, buf, offs, to_write, + UBI_UNKNOWN); if (err) break; @@ -216,7 +210,7 @@ struct ubi_volume *vol; struct ubi_device *ubi; - dbg_msg("erase %u bytes at offset %u", instr->len, instr->addr); + dbg_gen("erase %u bytes at offset %u", instr->len, instr->addr); if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) return -EINVAL; @@ -237,7 +231,7 @@ return -EROFS; for (i = 0; i < count; i++) { - err = ubi_eba_unmap_leb(ubi, vol->vol_id, lnum + i); + err = ubi_eba_unmap_leb(ubi, vol, lnum + i); if (err) goto out_err; } @@ -250,8 +244,8 @@ if (err) goto out_err; - instr->state = MTD_ERASE_DONE; - mtd_erase_callback(instr); + instr->state = MTD_ERASE_DONE; + mtd_erase_callback(instr); return 0; out_err: @@ -292,19 +286,20 @@ /* * In case of dynamic volume, MTD device size is just volume size. In * case of a static volume the size is equivalent to the amount of data - * bytes, which is zero at this moment and will be changed after volume - * update. + * bytes. 
*/ if (vol->vol_type == UBI_DYNAMIC_VOLUME) mtd->size = vol->usable_leb_size * vol->reserved_pebs; + else + mtd->size = vol->used_bytes; if (add_mtd_device(mtd)) { - ubi_err("cannot not add MTD device\n"); + ubi_err("cannot not add MTD device"); kfree(mtd->name); return -ENFILE; } - dbg_msg("added mtd%d (\"%s\"), size %u, EB size %u", + dbg_gen("added mtd%d (\"%s\"), size %u, EB size %u", mtd->index, mtd->name, mtd->size, mtd->erasesize); return 0; } @@ -322,7 +317,7 @@ int err; struct mtd_info *mtd = &vol->gluebi_mtd; - dbg_msg("remove mtd%d", mtd->index); + dbg_gen("remove mtd%d", mtd->index); err = del_mtd_device(mtd); if (err) return err; diff -Nurd linux-2.6.24/drivers/mtd/ubi/io.c ubifs-v2.6.24/drivers/mtd/ubi/io.c --- linux-2.6.24/drivers/mtd/ubi/io.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/io.c 2009-04-07 17:14:47.000000000 +0200 @@ -20,15 +20,15 @@ */ /* - * UBI input/output unit. + * UBI input/output sub-system. * - * This unit provides a uniform way to work with all kinds of the underlying - * MTD devices. It also implements handy functions for reading and writing UBI - * headers. + * This sub-system provides a uniform way to work with all kinds of the + * underlying MTD devices. It also implements handy functions for reading and + * writing UBI headers. * * We are trying to have a paranoid mindset and not to trust to what we read - * from the flash media in order to be more secure and robust. So this unit - * validates every single header it reads from the flash media. + * from the flash media in order to be more secure and robust. So this + * sub-system validates every single header it reads from the flash media. * * Some words about how the eraseblock headers are stored. * @@ -79,11 +79,11 @@ * 512-byte chunks, we have to allocate one more buffer and copy our VID header * to offset 448 of this buffer. * - * The I/O unit does the following trick in order to avoid this extra copy. 
- * It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID header - * and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. When the - * VID header is being written out, it shifts the VID header pointer back and - * writes the whole sub-page. + * The I/O sub-system does the following trick in order to avoid this extra + * copy. It always allocates a @ubi->vid_hdr_alsize bytes buffer for the VID + * header and returns a pointer to offset @ubi->vid_hdr_shift of this buffer. + * When the VID header is being written out, it shifts the VID header pointer + * back and writes the whole sub-page. */ #include @@ -156,15 +156,19 @@ /* * -EUCLEAN is reported if there was a bit-flip which * was corrected, so this is harmless. + * + * We do not report about it here unless debugging is + * enabled. A corresponding message will be printed + * later, when it has been scrubbed. */ - ubi_msg("fixable bit-flip detected at PEB %d", pnum); + dbg_msg("fixable bit-flip detected at PEB %d", pnum); ubi_assert(len == read); return UBI_IO_BITFLIPS; } if (read != len && retries++ < UBI_IO_RETRIES) { - dbg_io("error %d while reading %d bytes from PEB %d:%d, " - "read only %zd bytes, retry", + dbg_io("error %d while reading %d bytes from PEB %d:%d," + " read only %zd bytes, retry", err, len, pnum, offset, read); yield(); goto retry; @@ -173,11 +177,21 @@ ubi_err("error %d while reading %d bytes from PEB %d:%d, " "read %zd bytes", err, len, pnum, offset, read); ubi_dbg_dump_stack(); + + /* + * The driver should never return -EBADMSG if it failed to read + * all the requested data. But some buggy drivers might do + * this, so we change it to -EIO.
+ */ + if (read != len && err == -EBADMSG) { + ubi_assert(0); + err = -EIO; + } } else { ubi_assert(len == read); if (ubi_dbg_is_bitflip()) { - dbg_msg("bit-flip (emulated)"); + dbg_gen("bit-flip (emulated)"); err = UBI_IO_BITFLIPS; } } @@ -381,6 +395,7 @@ { int err, i, patt_count; + ubi_msg("run torture test for PEB %d", pnum); patt_count = ARRAY_SIZE(patterns); ubi_assert(patt_count > 0); @@ -424,6 +439,7 @@ } err = patt_count; + ubi_msg("PEB %d passed torture test, do not mark it a bad", pnum); out: mutex_unlock(&ubi->buf_mutex); @@ -667,6 +683,9 @@ if (verbose) ubi_warn("no EC header found at PEB %d, " "only 0xFF bytes", pnum); + else if (UBI_IO_DEBUG) + dbg_msg("no EC header found at PEB %d, " + "only 0xFF bytes", pnum); return UBI_IO_PEB_EMPTY; } @@ -678,7 +697,9 @@ ubi_warn("bad magic number at PEB %d: %08x instead of " "%08x", pnum, magic, UBI_EC_HDR_MAGIC); ubi_dbg_dump_ec_hdr(ec_hdr); - } + } else if (UBI_IO_DEBUG) + dbg_msg("bad magic number at PEB %d: %08x instead of " + "%08x", pnum, magic, UBI_EC_HDR_MAGIC); return UBI_IO_BAD_EC_HDR; } @@ -687,10 +708,12 @@ if (hdr_crc != crc) { if (verbose) { - ubi_warn("bad EC header CRC at PEB %d, calculated %#08x," - " read %#08x", pnum, crc, hdr_crc); + ubi_warn("bad EC header CRC at PEB %d, calculated " + "%#08x, read %#08x", pnum, crc, hdr_crc); ubi_dbg_dump_ec_hdr(ec_hdr); - } + } else if (UBI_IO_DEBUG) + dbg_msg("bad EC header CRC at PEB %d, calculated " + "%#08x, read %#08x", pnum, crc, hdr_crc); return UBI_IO_BAD_EC_HDR; } @@ -940,6 +963,9 @@ if (verbose) ubi_warn("no VID header found at PEB %d, " "only 0xFF bytes", pnum); + else if (UBI_IO_DEBUG) + dbg_msg("no VID header found at PEB %d, " + "only 0xFF bytes", pnum); return UBI_IO_PEB_FREE; } @@ -951,7 +977,9 @@ ubi_warn("bad magic number at PEB %d: %08x instead of " "%08x", pnum, magic, UBI_VID_HDR_MAGIC); ubi_dbg_dump_vid_hdr(vid_hdr); - } + } else if (UBI_IO_DEBUG) + dbg_msg("bad magic number at PEB %d: %08x instead of " + "%08x", pnum, magic, 
UBI_VID_HDR_MAGIC); return UBI_IO_BAD_VID_HDR; } @@ -963,7 +991,9 @@ ubi_warn("bad CRC at PEB %d, calculated %#08x, " "read %#08x", pnum, crc, hdr_crc); ubi_dbg_dump_vid_hdr(vid_hdr); - } + } else if (UBI_IO_DEBUG) + dbg_msg("bad CRC at PEB %d, calculated %#08x, " + "read %#08x", pnum, crc, hdr_crc); return UBI_IO_BAD_VID_HDR; } @@ -1004,7 +1034,7 @@ err = paranoid_check_peb_ec_hdr(ubi, pnum); if (err) - return err > 0 ? -EINVAL: err; + return err > 0 ? -EINVAL : err; vid_hdr->magic = cpu_to_be32(UBI_VID_HDR_MAGIC); vid_hdr->version = UBI_VERSION; @@ -1081,8 +1111,7 @@ } /** - * paranoid_check_peb_ec_hdr - check that the erase counter header of a - * physical eraseblock is in-place and is all right. + * paranoid_check_peb_ec_hdr - check erase counter header. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @@ -1160,8 +1189,7 @@ } /** - * paranoid_check_peb_vid_hdr - check that the volume identifier header of a - * physical eraseblock is in-place and is all right. + * paranoid_check_peb_vid_hdr - check volume identifier header. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @@ -1242,7 +1270,7 @@ fail: ubi_err("paranoid check failed for PEB %d", pnum); - dbg_msg("hex dump of the %d-%d region", offset, offset + len); + ubi_msg("hex dump of the %d-%d region", offset, offset + len); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, ubi->dbg_peb_buf, len, 1); err = 1; diff -Nurd linux-2.6.24/drivers/mtd/ubi/kapi.c ubifs-v2.6.24/drivers/mtd/ubi/kapi.c --- linux-2.6.24/drivers/mtd/ubi/kapi.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/kapi.c 2009-04-07 17:14:47.000000000 +0200 @@ -30,23 +30,27 @@ * @ubi_num: UBI device number * @di: the information is stored here * - * This function returns %0 in case of success and a %-ENODEV if there is no - * such UBI device. 
+ * This function returns %0 in case of success, %-EINVAL if the UBI device + * number is invalid, and %-ENODEV if there is no such UBI device. */ int ubi_get_device_info(int ubi_num, struct ubi_device_info *di) { - const struct ubi_device *ubi; + struct ubi_device *ubi; - if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || - !ubi_devices[ubi_num]) + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) + return -EINVAL; + + ubi = ubi_get_device(ubi_num); + if (!ubi) return -ENODEV; - ubi = ubi_devices[ubi_num]; di->ubi_num = ubi->ubi_num; di->leb_size = ubi->leb_size; di->min_io_size = ubi->min_io_size; di->ro_mode = ubi->ro_mode; - di->cdev = MKDEV(ubi->major, 0); + di->cdev = ubi->cdev.dev; + + ubi_put_device(ubi); return 0; } EXPORT_SYMBOL_GPL(ubi_get_device_info); @@ -73,7 +77,7 @@ vi->usable_leb_size = vol->usable_leb_size; vi->name_len = vol->name_len; vi->name = vol->name; - vi->cdev = MKDEV(ubi->major, vi->vol_id + 1); + vi->cdev = vol->cdev.dev; } EXPORT_SYMBOL_GPL(ubi_get_volume_info); @@ -102,39 +106,41 @@ struct ubi_device *ubi; struct ubi_volume *vol; - dbg_msg("open device %d volume %d, mode %d", ubi_num, vol_id, mode); - - err = -ENODEV; - if (ubi_num < 0) - return ERR_PTR(err); - - ubi = ubi_devices[ubi_num]; - - if (!try_module_get(THIS_MODULE)) - return ERR_PTR(err); + dbg_gen("open device %d volume %d, mode %d", ubi_num, vol_id, mode); - if (ubi_num >= UBI_MAX_DEVICES || !ubi) - goto out_put; + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) + return ERR_PTR(-EINVAL); - err = -EINVAL; - if (vol_id < 0 || vol_id >= ubi->vtbl_slots) - goto out_put; if (mode != UBI_READONLY && mode != UBI_READWRITE && mode != UBI_EXCLUSIVE) - goto out_put; + return ERR_PTR(-EINVAL); + + /* + * First of all, we have to get the UBI device to prevent its removal. 
+ */ + ubi = ubi_get_device(ubi_num); + if (!ubi) + return ERR_PTR(-ENODEV); + + if (vol_id < 0 || vol_id >= ubi->vtbl_slots) { + err = -EINVAL; + goto out_put_ubi; + } desc = kmalloc(sizeof(struct ubi_volume_desc), GFP_KERNEL); if (!desc) { err = -ENOMEM; - goto out_put; + goto out_put_ubi; } + err = -ENODEV; + if (!try_module_get(THIS_MODULE)) + goto out_free; + spin_lock(&ubi->volumes_lock); vol = ubi->volumes[vol_id]; - if (!vol) { - err = -ENODEV; + if (!vol) goto out_unlock; - } err = -EBUSY; switch (mode) { @@ -156,21 +162,19 @@ vol->exclusive = 1; break; } + get_device(&vol->dev); + vol->ref_count += 1; spin_unlock(&ubi->volumes_lock); desc->vol = vol; desc->mode = mode; - /* - * To prevent simultaneous checks of the same volume we use @vtbl_mutex, - * although it is not the purpose it was introduced for. - */ - mutex_lock(&ubi->vtbl_mutex); + mutex_lock(&ubi->ckvol_mutex); if (!vol->checked) { /* This is the first open - check the volume */ err = ubi_check_volume(ubi, vol_id); if (err < 0) { - mutex_unlock(&ubi->vtbl_mutex); + mutex_unlock(&ubi->ckvol_mutex); ubi_close_volume(desc); return ERR_PTR(err); } @@ -181,14 +185,17 @@ } vol->checked = 1; } - mutex_unlock(&ubi->vtbl_mutex); + mutex_unlock(&ubi->ckvol_mutex); + return desc; out_unlock: spin_unlock(&ubi->volumes_lock); - kfree(desc); -out_put: module_put(THIS_MODULE); +out_free: + kfree(desc); +out_put_ubi: + ubi_put_device(ubi); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(ubi_open_volume); @@ -205,10 +212,10 @@ int mode) { int i, vol_id = -1, len; - struct ubi_volume_desc *ret; struct ubi_device *ubi; + struct ubi_volume_desc *ret; - dbg_msg("open volume %s, mode %d", name, mode); + dbg_gen("open volume %s, mode %d", name, mode); if (!name) return ERR_PTR(-EINVAL); @@ -217,14 +224,12 @@ if (len > UBI_VOL_NAME_MAX) return ERR_PTR(-EINVAL); - ret = ERR_PTR(-ENODEV); - if (!try_module_get(THIS_MODULE)) - return ret; - - if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES || !ubi_devices[ubi_num]) - goto 
out_put; + if (ubi_num < 0 || ubi_num >= UBI_MAX_DEVICES) + return ERR_PTR(-EINVAL); - ubi = ubi_devices[ubi_num]; + ubi = ubi_get_device(ubi_num); + if (!ubi) + return ERR_PTR(-ENODEV); spin_lock(&ubi->volumes_lock); /* Walk all volumes of this UBI device */ @@ -238,13 +243,16 @@ } spin_unlock(&ubi->volumes_lock); - if (vol_id < 0) - goto out_put; - - ret = ubi_open_volume(ubi_num, vol_id, mode); + if (vol_id >= 0) + ret = ubi_open_volume(ubi_num, vol_id, mode); + else + ret = ERR_PTR(-ENODEV); -out_put: - module_put(THIS_MODULE); + /* + * We should put the UBI device even in case of success, because + * 'ubi_open_volume()' took a reference as well. + */ + ubi_put_device(ubi); return ret; } EXPORT_SYMBOL_GPL(ubi_open_volume_nm); @@ -256,10 +264,11 @@ void ubi_close_volume(struct ubi_volume_desc *desc) { struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; - dbg_msg("close volume %d, mode %d", vol->vol_id, desc->mode); + dbg_gen("close volume %d, mode %d", vol->vol_id, desc->mode); - spin_lock(&vol->ubi->volumes_lock); + spin_lock(&ubi->volumes_lock); switch (desc->mode) { case UBI_READONLY: vol->readers -= 1; @@ -270,9 +279,12 @@ case UBI_EXCLUSIVE: vol->exclusive = 0; } - spin_unlock(&vol->ubi->volumes_lock); + vol->ref_count -= 1; + spin_unlock(&ubi->volumes_lock); kfree(desc); + put_device(&vol->dev); + ubi_put_device(ubi); module_put(THIS_MODULE); } EXPORT_SYMBOL_GPL(ubi_close_volume); @@ -311,7 +323,7 @@ struct ubi_device *ubi = vol->ubi; int err, vol_id = vol->vol_id; - dbg_msg("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); + dbg_gen("read %d bytes from LEB %d:%d:%d", len, vol_id, lnum, offset); if (vol_id < 0 || vol_id >= ubi->vtbl_slots || lnum < 0 || lnum >= vol->used_ebs || offset < 0 || len < 0 || @@ -332,7 +344,7 @@ if (len == 0) return 0; - err = ubi_eba_read_leb(ubi, vol_id, lnum, buf, offset, len, check); + err = ubi_eba_read_leb(ubi, vol, lnum, buf, offset, len, check); if (err && err == -EBADMSG && vol->vol_type 
== UBI_STATIC_VOLUME) { ubi_warn("mark volume %d as corrupted", vol_id); vol->corrupted = 1; @@ -376,7 +388,7 @@ struct ubi_device *ubi = vol->ubi; int vol_id = vol->vol_id; - dbg_msg("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset); + dbg_gen("write %d bytes to LEB %d:%d:%d", len, vol_id, lnum, offset); if (vol_id < 0 || vol_id >= ubi->vtbl_slots) return -EINVAL; @@ -385,8 +397,8 @@ return -EROFS; if (lnum < 0 || lnum >= vol->reserved_pebs || offset < 0 || len < 0 || - offset + len > vol->usable_leb_size || offset % ubi->min_io_size || - len % ubi->min_io_size) + offset + len > vol->usable_leb_size || + offset & (ubi->min_io_size - 1) || len & (ubi->min_io_size - 1)) return -EINVAL; if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && @@ -399,7 +411,7 @@ if (len == 0) return 0; - return ubi_eba_write_leb(ubi, vol_id, lnum, buf, offset, len, dtype); + return ubi_eba_write_leb(ubi, vol, lnum, buf, offset, len, dtype); } EXPORT_SYMBOL_GPL(ubi_leb_write); @@ -426,7 +438,7 @@ struct ubi_device *ubi = vol->ubi; int vol_id = vol->vol_id; - dbg_msg("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum); + dbg_gen("atomically write %d bytes to LEB %d:%d", len, vol_id, lnum); if (vol_id < 0 || vol_id >= ubi->vtbl_slots) return -EINVAL; @@ -435,7 +447,7 @@ return -EROFS; if (lnum < 0 || lnum >= vol->reserved_pebs || len < 0 || - len > vol->usable_leb_size || len % ubi->min_io_size) + len > vol->usable_leb_size || len & (ubi->min_io_size - 1)) return -EINVAL; if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && @@ -448,7 +460,7 @@ if (len == 0) return 0; - return ubi_eba_atomic_leb_change(ubi, vol_id, lnum, buf, len, dtype); + return ubi_eba_atomic_leb_change(ubi, vol, lnum, buf, len, dtype); } EXPORT_SYMBOL_GPL(ubi_leb_change); @@ -468,9 +480,9 @@ { struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; - int err, vol_id = vol->vol_id; + int err; - dbg_msg("erase LEB %d:%d", vol_id, lnum); + dbg_gen("erase LEB %d:%d", vol->vol_id, lnum); 
if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) return -EROFS; @@ -481,7 +493,7 @@ if (vol->upd_marker) return -EBADF; - err = ubi_eba_unmap_leb(ubi, vol_id, lnum); + err = ubi_eba_unmap_leb(ubi, vol, lnum); if (err) return err; @@ -529,9 +541,8 @@ { struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; - int vol_id = vol->vol_id; - dbg_msg("unmap LEB %d:%d", vol_id, lnum); + dbg_gen("unmap LEB %d:%d", vol->vol_id, lnum); if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) return -EROFS; @@ -542,11 +553,55 @@ if (vol->upd_marker) return -EBADF; - return ubi_eba_unmap_leb(ubi, vol_id, lnum); + return ubi_eba_unmap_leb(ubi, vol, lnum); } EXPORT_SYMBOL_GPL(ubi_leb_unmap); /** + * ubi_leb_map - map logical eraseblock to a physical eraseblock. + * @desc: volume descriptor + * @lnum: logical eraseblock number + * @dtype: expected data type + * + * This function maps an un-mapped logical eraseblock @lnum to a physical + * eraseblock. This means, that after a successful invocation of this + * function the logical eraseblock @lnum will be empty (contain only %0xFF + * bytes) and be mapped to a physical eraseblock, even if an unclean reboot + * happens. + * + * This function returns zero in case of success, %-EBADF if the volume is + * damaged because of an interrupted update, %-EBADMSG if the logical + * eraseblock is already mapped, and other negative error codes in case of + * other failures.
+ */ +int ubi_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + struct ubi_volume *vol = desc->vol; + struct ubi_device *ubi = vol->ubi; + + dbg_gen("map LEB %d:%d", vol->vol_id, lnum); + + if (desc->mode == UBI_READONLY || vol->vol_type == UBI_STATIC_VOLUME) + return -EROFS; + + if (lnum < 0 || lnum >= vol->reserved_pebs) + return -EINVAL; + + if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM && + dtype != UBI_UNKNOWN) + return -EINVAL; + + if (vol->upd_marker) + return -EBADF; + + if (vol->eba_tbl[lnum] >= 0) + return -EBADMSG; + + return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0, dtype); +} +EXPORT_SYMBOL_GPL(ubi_leb_map); + +/** * ubi_is_mapped - check if logical eraseblock is mapped. * @desc: volume descriptor * @lnum: logical eraseblock number @@ -566,7 +621,7 @@ { struct ubi_volume *vol = desc->vol; - dbg_msg("test LEB %d:%d", vol->vol_id, lnum); + dbg_gen("test LEB %d:%d", vol->vol_id, lnum); if (lnum < 0 || lnum >= vol->reserved_pebs) return -EINVAL; @@ -577,3 +632,27 @@ return vol->eba_tbl[lnum] >= 0; } EXPORT_SYMBOL_GPL(ubi_is_mapped); + +/** + * ubi_sync - synchronize UBI device buffers. + * @ubi_num: UBI device to synchronize + * + * The underlying MTD device may cache data in hardware or in software. This + * function ensures the caches are flushed. Returns zero in case of success and + * a negative error code in case of failure.
+ */ +int ubi_sync(int ubi_num) +{ + struct ubi_device *ubi; + + ubi = ubi_get_device(ubi_num); + if (!ubi) + return -ENODEV; + + if (ubi->mtd->sync) + ubi->mtd->sync(ubi->mtd); + + ubi_put_device(ubi); + return 0; +} +EXPORT_SYMBOL_GPL(ubi_sync); diff -Nurd linux-2.6.24/drivers/mtd/ubi/misc.c ubifs-v2.6.24/drivers/mtd/ubi/misc.c --- linux-2.6.24/drivers/mtd/ubi/misc.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/misc.c 2009-04-07 17:14:47.000000000 +0200 @@ -37,7 +37,7 @@ { int i; - ubi_assert(length % ubi->min_io_size == 0); + ubi_assert(!(length & (ubi->min_io_size - 1))); for (i = length - 1; i >= 0; i--) if (((const uint8_t *)buf)[i] != 0xFF) @@ -79,7 +79,7 @@ else size = vol->usable_leb_size; - err = ubi_eba_read_leb(ubi, vol_id, i, buf, 0, size, 1); + err = ubi_eba_read_leb(ubi, vol, i, buf, 0, size, 1); if (err) { if (err == -EBADMSG) err = 1; diff -Nurd linux-2.6.24/drivers/mtd/ubi/scan.c ubifs-v2.6.24/drivers/mtd/ubi/scan.c --- linux-2.6.24/drivers/mtd/ubi/scan.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/scan.c 2009-04-07 17:14:47.000000000 +0200 @@ -19,9 +19,9 @@ */ /* - * UBI scanning unit. + * UBI scanning sub-system. * - * This unit is responsible for scanning the flash media, checking UBI + * This sub-system is responsible for scanning the flash media, checking UBI * headers and providing complete information about the UBI flash image. * * The scanning information is represented by a &struct ubi_scan_info' object. @@ -92,29 +92,7 @@ } /** - * commit_to_mean_value - commit intermediate results to the final mean erase - * counter value. - * @si: scanning information - * - * This is a helper function which calculates partial mean erase counter mean - * value and adds it to the resulting mean value. 
As we can work only in - * integer arithmetic and we want to calculate the mean value of erase counter - * accurately, we first sum erase counter values in @si->ec_sum variable and - * count these components in @si->ec_count. If this temporary @si->ec_sum is - * going to overflow, we calculate the partial mean value - * (@si->ec_sum/@si->ec_count) and add it to @si->mean_ec. - */ -static void commit_to_mean_value(struct ubi_scan_info *si) -{ - si->ec_sum /= si->ec_count; - if (si->ec_sum % si->ec_count >= si->ec_count / 2) - si->mean_ec += 1; - si->mean_ec += si->ec_sum; -} - -/** - * validate_vid_hdr - check that volume identifier header is correct and - * consistent. + * validate_vid_hdr - check volume identifier header. * @vid_hdr: the volume identifier header to check * @sv: information about the volume this logical eraseblock belongs to * @pnum: physical eraseblock number the VID header came from @@ -123,7 +101,7 @@ * non-zero if an inconsistency was found and zero if not. * * Note, UBI does sanity check of everything it reads from the flash media. - * Most of the checks are done in the I/O unit. Here we check that the + * Most of the checks are done in the I/O sub-system. Here we check that the * information in the VID header is consistent to the information in other VID * headers of the same volume. */ @@ -267,40 +245,21 @@ struct ubi_vid_hdr *vh = NULL; unsigned long long sqnum2 = be64_to_cpu(vid_hdr->sqnum); - if (seb->sqnum == 0 && sqnum2 == 0) { - long long abs, v1 = seb->leb_ver, v2 = be32_to_cpu(vid_hdr->leb_ver); - + if (sqnum2 == seb->sqnum) { /* - * UBI constantly increases the logical eraseblock version - * number and it can overflow. Thus, we have to bear in mind - * that versions that are close to %0xFFFFFFFF are less then - * versions that are close to %0. - * - * The UBI WL unit guarantees that the number of pending tasks - * is not greater then %0x7FFFFFFF. 
So, if the difference - * between any two versions is greater or equivalent to - * %0x7FFFFFFF, there was an overflow and the logical - * eraseblock with lower version is actually newer then the one - * with higher version. - * - * FIXME: but this is anyway obsolete and will be removed at - * some point. + * This must be a really ancient UBI image which has been + * created before sequence numbers support has been added. At + * that time we used 32-bit LEB versions stored in logical + * eraseblocks. That was before UBI got into mainline. We do not + * support these images anymore. Well, those images will + * still work, but only if no unclean reboots happened. */ + ubi_err("unsupported on-flash UBI format\n"); + return -EINVAL; + } - dbg_bld("using old crappy leb_ver stuff"); - - abs = v1 - v2; - if (abs < 0) - abs = -abs; - - if (abs < 0x7FFFFFFF) - /* Non-overflow situation */ - second_is_newer = (v2 > v1); - else - second_is_newer = (v2 < v1); - } else - /* Obviously the LEB with lower sequence counter is older */ - second_is_newer = sqnum2 > seb->sqnum; + /* Obviously the LEB with lower sequence counter is older */ + second_is_newer = !!(sqnum2 > seb->sqnum); /* * Now we know which copy is newer. If the copy flag of the PEB with @@ -308,7 +267,7 @@ * check data CRC. For the second PEB we already have the VID header, * for the first one - we'll need to re-read it from flash. * - * FIXME: this may be optimized so that we wouldn't read twice. + * Note: this may be optimized so that we wouldn't read twice.
*/ if (second_is_newer) { @@ -360,7 +319,7 @@ } err = ubi_io_read_data(ubi, buf, pnum, 0, len); - if (err && err != UBI_IO_BITFLIPS) + if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG) goto out_free_buf; data_crc = be32_to_cpu(vid_hdr->data_crc); @@ -390,13 +349,11 @@ vfree(buf); out_free_vidh: ubi_free_vid_hdr(ubi, vh); - ubi_assert(err < 0); return err; } /** - * ubi_scan_add_used - add information about a physical eraseblock to the - * scanning information. + * ubi_scan_add_used - add physical eraseblock to the scanning information. * @ubi: UBI device description object * @si: scanning information * @pnum: the physical eraseblock number @@ -416,7 +373,6 @@ int bitflips) { int err, vol_id, lnum; - uint32_t leb_ver; unsigned long long sqnum; struct ubi_scan_volume *sv; struct ubi_scan_leb *seb; @@ -425,13 +381,12 @@ vol_id = be32_to_cpu(vid_hdr->vol_id); lnum = be32_to_cpu(vid_hdr->lnum); sqnum = be64_to_cpu(vid_hdr->sqnum); - leb_ver = be32_to_cpu(vid_hdr->leb_ver); - dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, ver %u, bitflips %d", - pnum, vol_id, lnum, ec, sqnum, leb_ver, bitflips); + dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, bitflips %d", + pnum, vol_id, lnum, ec, sqnum, bitflips); sv = add_volume(si, vol_id, pnum, vid_hdr); - if (IS_ERR(sv) < 0) + if (IS_ERR(sv)) return PTR_ERR(sv); if (si->max_sqnum < sqnum) @@ -461,25 +416,20 @@ */ dbg_bld("this LEB already exists: PEB %d, sqnum %llu, " - "LEB ver %u, EC %d", seb->pnum, seb->sqnum, - seb->leb_ver, seb->ec); - - /* - * Make sure that the logical eraseblocks have different - * versions. Otherwise the image is bad. - */ - if (seb->leb_ver == leb_ver && leb_ver != 0) { - ubi_err("two LEBs with same version %u", leb_ver); - ubi_dbg_dump_seb(seb, 0); - ubi_dbg_dump_vid_hdr(vid_hdr); - return -EINVAL; - } + "EC %d", seb->pnum, seb->sqnum, seb->ec); /* * Make sure that the logical eraseblocks have different * sequence numbers. Otherwise the image is bad. 
* - * FIXME: remove 'sqnum != 0' check when leb_ver is removed. + * However, if the sequence number is zero, we assume it must + * be an ancient UBI image from the era when UBI did not have + * sequence numbers. We still can attach these images, unless + * there is a need to distinguish between old and new + * eraseblocks, in which case we'll refuse the image in + * 'compare_lebs()'. In other words, we attach old clean + * images, but refuse attaching old images with duplicated + * logical eraseblocks because there was an unclean reboot. */ if (seb->sqnum == sqnum && sqnum != 0) { ubi_err("two LEBs with same sequence number %llu", @@ -519,7 +469,6 @@ seb->pnum = pnum; seb->scrub = ((cmp_res & 2) || bitflips); seb->sqnum = sqnum; - seb->leb_ver = leb_ver; if (sv->highest_lnum == lnum) sv->last_data_size = @@ -556,7 +505,6 @@ seb->lnum = lnum; seb->sqnum = sqnum; seb->scrub = bitflips; - seb->leb_ver = leb_ver; if (sv->highest_lnum <= lnum) { sv->highest_lnum = lnum; @@ -570,8 +518,7 @@ } /** - * ubi_scan_find_sv - find information about a particular volume in the - * scanning information. + * ubi_scan_find_sv - find volume in the scanning information. * @si: scanning information * @vol_id: the requested volume ID * @@ -600,8 +547,7 @@ } /** - * ubi_scan_find_seb - find information about a particular logical - * eraseblock in the volume scanning information. + * ubi_scan_find_seb - find LEB in the volume scanning information. * @sv: a pointer to the volume scanning information * @lnum: the requested logical eraseblock * @@ -661,9 +607,9 @@ * * This function erases physical eraseblock 'pnum', and writes the erase * counter header to it. This function should only be used on UBI device - * initialization stages, when the EBA unit had not been yet initialized. This - * function returns zero in case of success and a negative error code in case - * of failure. + * initialization stages, when the EBA sub-system had not been yet initialized. 
+ * This function returns zero in case of success and a negative error code in + * case of failure. */ int ubi_scan_erase_peb(struct ubi_device *ubi, const struct ubi_scan_info *si, int pnum, int ec) @@ -703,9 +649,10 @@ * @si: scanning information * * This function returns a free physical eraseblock. It is supposed to be - * called on the UBI initialization stages when the wear-leveling unit is not - * initialized yet. This function picks a physical eraseblocks from one of the - * lists, writes the EC header if it is needed, and removes it from the list. + * called on the UBI initialization stages when the wear-leveling sub-system is + * not initialized yet. This function picks a physical eraseblocks from one of + * the lists, writes the EC header if it is needed, and removes it from the + * list. * * This function returns scanning physical eraseblock information in case of * success and an error code in case of failure. @@ -758,8 +705,7 @@ } /** - * process_eb - read UBI headers, check them and add corresponding data - * to the scanning information. + * process_eb - read, check UBI headers, and add them to scanning information. * @ubi: UBI device description object * @si: scanning information * @pnum: the physical eraseblock number @@ -767,9 +713,10 @@ * This function returns a zero if the physical eraseblock was successfully * handled and a negative error code in case of failure. */ -static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum) +static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, + int pnum) { - long long ec; + long long uninitialized_var(ec); int err, bitflips = 0, vol_id, ec_corr = 0; dbg_bld("scan PEB %d", pnum); @@ -780,8 +727,9 @@ return err; else if (err) { /* - * FIXME: this is actually duty of the I/O unit to initialize - * this, but MTD does not provide enough information. 
+ * FIXME: this is actually duty of the I/O sub-system to + * initialize this, but MTD does not provide enough + * information. */ si->bad_peb_count += 1; return 0; @@ -854,7 +802,7 @@ } vol_id = be32_to_cpu(vidh->vol_id); - if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOL_ID) { + if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) { int lnum = be32_to_cpu(vidh->lnum); /* Unsupported internal volume */ @@ -897,15 +845,8 @@ adjust_mean_ec: if (!ec_corr) { - if (si->ec_sum + ec < ec) { - commit_to_mean_value(si); - si->ec_sum = 0; - si->ec_count = 0; - } else { - si->ec_sum += ec; - si->ec_count += 1; - } - + si->ec_sum += ec; + si->ec_count += 1; if (ec > si->max_ec) si->max_ec = ec; if (ec < si->min_ec) @@ -953,7 +894,7 @@ for (pnum = 0; pnum < ubi->peb_count; pnum++) { cond_resched(); - dbg_msg("process PEB %d", pnum); + dbg_gen("process PEB %d", pnum); err = process_eb(ubi, si, pnum); if (err < 0) goto out_vidh; @@ -961,9 +902,9 @@ dbg_msg("scanning is finished"); - /* Finish mean erase counter calculations */ + /* Calculate mean erase counter */ if (si->ec_count) - commit_to_mean_value(si); + si->mean_ec = div_u64(si->ec_sum, si->ec_count); if (si->is_empty) ubi_msg("empty MTD device detected"); @@ -1100,8 +1041,7 @@ #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID /** - * paranoid_check_si - check if the scanning information is correct and - * consistent. + * paranoid_check_si - check the scanning information. 
* @ubi: UBI device description object * @si: scanning information * @@ -1286,11 +1226,6 @@ ubi_err("bad data_pad %d", sv->data_pad); goto bad_vid_hdr; } - - if (seb->leb_ver != be32_to_cpu(vidh->leb_ver)) { - ubi_err("bad leb_ver %u", seb->leb_ver); - goto bad_vid_hdr; - } } if (!last_seb) @@ -1320,8 +1255,7 @@ if (err < 0) { kfree(buf); return err; - } - else if (err) + } else if (err) buf[pnum] = 1; } diff -Nurd linux-2.6.24/drivers/mtd/ubi/scan.h ubifs-v2.6.24/drivers/mtd/ubi/scan.h --- linux-2.6.24/drivers/mtd/ubi/scan.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/scan.h 2009-04-07 17:14:47.000000000 +0200 @@ -34,7 +34,6 @@ * @u: unions RB-tree or @list links * @u.rb: link in the per-volume RB-tree of &struct ubi_scan_leb objects * @u.list: link in one of the eraseblock lists - * @leb_ver: logical eraseblock version (obsolete) * * One object of this type is allocated for each physical eraseblock during * scanning. @@ -49,7 +48,6 @@ struct rb_node rb; struct list_head list; } u; - uint32_t leb_ver; }; /** @@ -59,16 +57,16 @@ * @leb_count: number of logical eraseblocks in this volume * @vol_type: volume type * @used_ebs: number of used logical eraseblocks in this volume (only for - * static volumes) + * static volumes) * @last_data_size: amount of data in the last logical eraseblock of this - * volume (always equivalent to the usable logical eraseblock size in case of - * dynamic volumes) + * volume (always equivalent to the usable logical eraseblock + * size in case of dynamic volumes) * @data_pad: how many bytes at the end of logical eraseblocks of this volume - * are not used (due to volume alignment) + * are not used (due to volume alignment) * @compat: compatibility flags of this volume * @rb: link in the volume RB-tree * @root: root of the RB-tree containing all the eraseblock belonging to this - * volume (&struct ubi_scan_leb objects) + * volume (&struct ubi_scan_leb objects) * * One object of this type is allocated for each 
volume during scanning. */ @@ -92,8 +90,8 @@ * @free: list of free physical eraseblocks * @erase: list of physical eraseblocks which have to be erased * @alien: list of physical eraseblocks which should not be used by UBI (e.g., + * those belonging to "preserve"-compatible internal volumes) * @bad_peb_count: count of bad physical eraseblocks - * those belonging to "preserve"-compatible internal volumes) * @vols_found: number of volumes found during scanning * @highest_vol_id: highest volume ID * @alien_peb_count: count of physical eraseblocks in the @alien list @@ -106,8 +104,8 @@ * @ec_count: a temporary variable used when calculating @mean_ec * * This data structure contains the result of scanning and may be used by other - * UBI units to build final UBI data structures, further error-recovery and so - * on. + * UBI sub-systems to build final UBI data structures, further error-recovery + * and so on. */ struct ubi_scan_info { struct rb_root volumes; @@ -124,7 +122,7 @@ int max_ec; unsigned long long max_sqnum; int mean_ec; - int ec_sum; + uint64_t ec_sum; int ec_count; }; @@ -132,8 +130,7 @@ struct ubi_vid_hdr; /* - * ubi_scan_move_to_list - move a physical eraseblock from the volume tree to a - * list. + * ubi_scan_move_to_list - move a PEB from the volume tree to a list. * * @sv: volume scanning information * @seb: scanning eraseblock infprmation diff -Nurd linux-2.6.24/drivers/mtd/ubi/ubi-media.h ubifs-v2.6.24/drivers/mtd/ubi/ubi-media.h --- linux-2.6.24/drivers/mtd/ubi/ubi-media.h 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/ubi-media.h 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,368 @@ +/* + * Copyright (c) International Business Machines Corp., 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Thomas Gleixner + * Frank Haverkamp + * Oliver Lohmann + * Andreas Arnez + */ + +/* + * This file defines the layout of UBI headers and all the other UBI on-flash + * data structures. + */ + +#ifndef __UBI_MEDIA_H__ +#define __UBI_MEDIA_H__ + +#include + +/* The version of UBI images supported by this implementation */ +#define UBI_VERSION 1 + +/* The highest erase counter value supported by this implementation */ +#define UBI_MAX_ERASECOUNTER 0x7FFFFFFF + +/* The initial CRC32 value used when calculating CRC checksums */ +#define UBI_CRC32_INIT 0xFFFFFFFFU + +/* Erase counter header magic number (ASCII "UBI#") */ +#define UBI_EC_HDR_MAGIC 0x55424923 +/* Volume identifier header magic number (ASCII "UBI!") */ +#define UBI_VID_HDR_MAGIC 0x55424921 + +/* + * Volume type constants used in the volume identifier header. + * + * @UBI_VID_DYNAMIC: dynamic volume + * @UBI_VID_STATIC: static volume + */ +enum { + UBI_VID_DYNAMIC = 1, + UBI_VID_STATIC = 2 +}; + +/* + * Volume flags used in the volume table record. + * + * @UBI_VTBL_AUTORESIZE_FLG: auto-resize this volume + * + * %UBI_VTBL_AUTORESIZE_FLG flag can be set only for one volume in the volume + * table. UBI automatically re-sizes the volume which has this flag and makes + * the volume to be of largest possible size. 
This means that if after the + * initialization UBI finds out that there are available physical eraseblocks + * present on the device, it automatically appends all of them to the volume + * (the physical eraseblocks reserved for bad eraseblocks handling and other + * reserved physical eraseblocks are not taken). So, if there is a volume with + * the %UBI_VTBL_AUTORESIZE_FLG flag set, the amount of available logical + * eraseblocks will be zero after UBI is loaded, because all of them will be + * reserved for this volume. Note, the %UBI_VTBL_AUTORESIZE_FLG bit is cleared + * after the volume has been initialized. + * + * The auto-resize feature is useful for device production purposes. For + * example, different NAND flash chips may have different amount of initial bad + * eraseblocks, depending on particular chip instance. Manufacturers of NAND + * chips usually guarantee that the amount of initial bad eraseblocks does not + * exceed certain percent, e.g. 2%. When one creates an UBI image which will be + * flashed to the end devices in production, he does not know the exact amount + * of good physical eraseblocks the NAND chip on the device will have, but this + * number is required to calculate the volume sizes and put them to the volume + * table of the UBI image. In this case, one of the volumes (e.g., the one + * which will store the root file system) is marked as "auto-resizable", and + * UBI will adjust its size on the first boot if needed. + * + * Note, first UBI reserves some amount of physical eraseblocks for bad + * eraseblock handling, and then re-sizes the volume, not vice-versa. This + * means that the pool of reserved physical eraseblocks will always be present. + */ +enum { + UBI_VTBL_AUTORESIZE_FLG = 0x01, +}; + +/* + * Compatibility constants used by internal volumes.
+ * + * @UBI_COMPAT_DELETE: delete this internal volume before anything is written + * to the flash + * @UBI_COMPAT_RO: attach this device in read-only mode + * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its + * physical eraseblocks, don't allow the wear-leveling + * sub-system to move them + * @UBI_COMPAT_REJECT: reject this UBI image + */ +enum { + UBI_COMPAT_DELETE = 1, + UBI_COMPAT_RO = 2, + UBI_COMPAT_PRESERVE = 4, + UBI_COMPAT_REJECT = 5 +}; + +/* Sizes of UBI headers */ +#define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr) +#define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr) + +/* Sizes of UBI headers without the ending CRC */ +#define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(__be32)) +#define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(__be32)) + +/** + * struct ubi_ec_hdr - UBI erase counter header. + * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC) + * @version: version of UBI implementation which is supposed to accept this + * UBI image + * @padding1: reserved for future, zeroes + * @ec: the erase counter + * @vid_hdr_offset: where the VID header starts + * @data_offset: where the user data start + * @padding2: reserved for future, zeroes + * @hdr_crc: erase counter header CRC checksum + * + * The erase counter header takes 64 bytes and has plenty of unused space for + * future usage. The unused fields are zeroed. The @version field is used to + * indicate the version of UBI implementation which is supposed to be able to + * work with this UBI image. If @version is greater than the current UBI + * version, the image is rejected. This may be useful in future if something + * is changed radically. This field is duplicated in the volume identifier + * header. + * + * The @vid_hdr_offset and @data_offset fields contain the offset of the + * volume identifier header and user data, relative to the beginning of the + * physical eraseblock. These values have to be the same for all physical + * eraseblocks.
+ */ +struct ubi_ec_hdr { + __be32 magic; + __u8 version; + __u8 padding1[3]; + __be64 ec; /* Warning: the current limit is 31-bit anyway! */ + __be32 vid_hdr_offset; + __be32 data_offset; + __u8 padding2[36]; + __be32 hdr_crc; +} __attribute__ ((packed)); + +/** + * struct ubi_vid_hdr - on-flash UBI volume identifier header. + * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC) + * @version: UBI implementation version which is supposed to accept this UBI + * image (%UBI_VERSION) + * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC) + * @copy_flag: if this logical eraseblock was copied from another physical + * eraseblock (for wear-leveling reasons) + * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE, + * %UBI_COMPAT_RO, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT) + * @vol_id: ID of this volume + * @lnum: logical eraseblock number + * @padding1: reserved for future, zeroes + * @data_size: how many bytes of data this logical eraseblock contains + * @used_ebs: total number of used logical eraseblocks in this volume + * @data_pad: how many bytes at the end of this physical eraseblock are not + * used + * @data_crc: CRC checksum of the data stored in this logical eraseblock + * @padding2: reserved for future, zeroes + * @sqnum: sequence number + * @padding3: reserved for future, zeroes + * @hdr_crc: volume identifier header CRC checksum + * + * The @sqnum is the value of the global sequence counter at the time when this + * VID header was created. The global sequence counter is incremented each time + * UBI writes a new VID header to the flash, i.e. when it maps a logical + * eraseblock to a new physical eraseblock. The global sequence counter is an + * unsigned 64-bit integer and we assume it never overflows. The @sqnum + * (sequence number) is used to distinguish between older and newer versions of + * logical eraseblocks.
+ * + * There are 2 situations when there may be more than one physical eraseblock + * corresponding to the same logical eraseblock, i.e., having the same @vol_id + * and @lnum values in the volume identifier header. Suppose we have a logical + * eraseblock L and it is mapped to the physical eraseblock P. + * + * 1. Because UBI may erase physical eraseblocks asynchronously, the following + * situation is possible: L is asynchronously erased, so P is scheduled for + * erasure, then L is written to, i.e. mapped to another physical eraseblock P1, + * so P1 is written to, then an unclean reboot happens. Result - there are 2 + * physical eraseblocks P and P1 corresponding to the same logical eraseblock + * L. But P1 has greater sequence number, so UBI picks P1 when it attaches the + * flash. + * + * 2. From time to time UBI moves logical eraseblocks to other physical + * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P + * to P1, and an unclean reboot happens before P is physically erased, there + * are two physical eraseblocks P and P1 corresponding to L and UBI has to + * select one of them when the flash is attached. The @sqnum field says which + * PEB is the original (obviously P will have lower @sqnum) and the copy. But + * it is not enough to select the physical eraseblock with the higher sequence + * number, because the unclean reboot could have happened in the middle of the + * copying process, so the data in P1 is corrupted. It is also not enough to + * just select the physical eraseblock with lower sequence number, because the + * data there may be old (consider a case if more data was added to P1 after + * the copying). Moreover, the unclean reboot may happen when the erasure of P + * was just started, so it results in unstable P, which is "mostly" OK, but + * still has unstable bits.
UBI also calculates data CRC when the data is moved and stores it at + * the @data_crc field of the copy (P1). So when UBI needs to pick one physical + * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is + * examined. If it is cleared, the situation is simple and the newer one is + * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC + * checksum is correct, this physical eraseblock is selected (P1). Otherwise + * the older one (P) is selected. + * + * There are 2 sorts of volumes in UBI: user volumes and internal volumes. + * Internal volumes are not seen from outside and are used for various internal + * UBI purposes. In this implementation there is only one internal volume - the + * layout volume. Internal volumes are the main mechanism of UBI extensions. + * For example, in future one may introduce a journal internal volume. Internal + * volumes have their own reserved range of IDs. + * + * The @compat field is only used for internal volumes and contains the "degree + * of their compatibility". It is always zero for user volumes. This field + * provides a mechanism to introduce UBI extensions and to be still compatible + * with older UBI binaries. For example, if someone introduced a journal in + * future, he would probably use %UBI_COMPAT_DELETE compatibility for the + * journal volume. And in this case, older UBI binaries, which know nothing + * about the journal volume, would just delete this volume and work perfectly + * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image + * - it just ignores the Ext3fs journal. + * + * The @data_crc field contains the CRC checksum of the contents of the logical + * eraseblock if this is a static volume. In case of dynamic volumes, it does + * not contain the CRC checksum as a rule.
The only exception is when the + * data of the physical eraseblock was moved by the wear-leveling sub-system, + * then the wear-leveling sub-system calculates the data CRC and stores it in + * the @data_crc field. And of course, the @copy_flag is set in this case. + * + * The @data_size field is used only for static volumes because UBI has to know + * how many bytes of data are stored in this eraseblock. For dynamic volumes, + * this field usually contains zero. The only exception is when the data of the + * physical eraseblock was moved to another physical eraseblock for + * wear-leveling reasons. In this case, UBI calculates CRC checksum of the + * contents and uses both @data_crc and @data_size fields. In this case, the + * @data_size field contains data size. + * + * The @used_ebs field is used only for static volumes and indicates how many + * eraseblocks the data of the volume takes. For dynamic volumes this field is + * not used and always contains zero. + * + * The @data_pad is calculated when volumes are created using the alignment + * parameter. So, effectively, the @data_pad field reduces the size of logical + * eraseblocks of this volume. This is very handy when one uses block-oriented + * software (say, cramfs) on top of the UBI volume. + */ +struct ubi_vid_hdr { + __be32 magic; + __u8 version; + __u8 vol_type; + __u8 copy_flag; + __u8 compat; + __be32 vol_id; + __be32 lnum; + __u8 padding1[4]; + __be32 data_size; + __be32 used_ebs; + __be32 data_pad; + __be32 data_crc; + __u8 padding2[4]; + __be64 sqnum; + __u8 padding3[12]; + __be32 hdr_crc; +} __attribute__ ((packed)); + +/* Internal UBI volumes count */ +#define UBI_INT_VOL_COUNT 1 + +/* + * Starting ID of internal volumes. There is reserved room for 4096 internal + * volumes.
+ */ +#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096) + +/* The layout volume contains the volume table */ + +#define UBI_LAYOUT_VOLUME_ID UBI_INTERNAL_VOL_START +#define UBI_LAYOUT_VOLUME_TYPE UBI_VID_DYNAMIC +#define UBI_LAYOUT_VOLUME_ALIGN 1 +#define UBI_LAYOUT_VOLUME_EBS 2 +#define UBI_LAYOUT_VOLUME_NAME "layout volume" +#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT + +/* The maximum number of volumes per one UBI device */ +#define UBI_MAX_VOLUMES 128 + +/* The maximum volume name length */ +#define UBI_VOL_NAME_MAX 127 + +/* Size of the volume table record */ +#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record) + +/* Size of the volume table record without the ending CRC */ +#define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(__be32)) + +/** + * struct ubi_vtbl_record - a record in the volume table. + * @reserved_pebs: how many physical eraseblocks are reserved for this volume + * @alignment: volume alignment + * @data_pad: how many bytes are unused at the end of each physical + * eraseblock to satisfy the requested alignment + * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) + * @upd_marker: if volume update was started but not finished + * @name_len: volume name length + * @name: the volume name + * @flags: volume flags (%UBI_VTBL_AUTORESIZE_FLG) + * @padding: reserved, zeroes + * @crc: a CRC32 checksum of the record + * + * The volume table records are stored in the volume table, which is stored in + * the layout volume. The layout volume consists of 2 logical eraseblocks, each + * of which contains a copy of the volume table (i.e., the volume table is + * duplicated). The volume table is an array of &struct ubi_vtbl_record + * objects indexed by the volume ID. + * + * If the size of the logical eraseblock is large enough to fit + * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES + * records.
Otherwise, it contains as many records as it can fit (i.e., size of + * logical eraseblock divided by sizeof(struct ubi_vtbl_record)). + * + * The @upd_marker flag is used to implement volume update. It is set to %1 + * before update and set to %0 after the update. So if the update operation was + * interrupted, UBI knows that the volume is corrupted. + * + * The @alignment field is specified when the volume is created and cannot be + * later changed. It may be useful, for example, when a block-oriented file + * system works on top of UBI. The @data_pad field is calculated using the + * logical eraseblock size and @alignment. The alignment must be a multiple of the + * minimal flash I/O unit. If @alignment is 1, all the available space of + * the physical eraseblocks is used. + * + * Empty records contain all zeroes and the CRC checksum of those zeroes. + */ +struct ubi_vtbl_record { + __be32 reserved_pebs; + __be32 alignment; + __be32 data_pad; + __u8 vol_type; + __u8 upd_marker; + __be16 name_len; + __u8 name[UBI_VOL_NAME_MAX+1]; + __u8 flags; + __u8 padding[23]; + __be32 crc; +} __attribute__ ((packed)); + +#endif /* !__UBI_MEDIA_H__ */ diff -Nurd linux-2.6.24/drivers/mtd/ubi/ubi.h ubifs-v2.6.24/drivers/mtd/ubi/ubi.h --- linux-2.6.24/drivers/mtd/ubi/ubi.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/ubi.h 2009-04-07 17:14:47.000000000 +0200 @@ -37,10 +37,9 @@ #include #include #include - -#include #include +#include "ubi-media.h" #include "scan.h" #include "debug.h" @@ -75,15 +74,22 @@ #define UBI_IO_RETRIES 3 /* - * Error codes returned by the I/O unit. + * Length of the protection queue. The length is effectively equivalent to the + * number of (global) erase cycles PEBs are protected from the wear-leveling + * worker. + */ +#define UBI_PROT_QUEUE_LEN 10 + +/* + * Error codes returned by the I/O sub-system. * * UBI_IO_PEB_EMPTY: the physical eraseblock is empty, i.e.
it contains only - * 0xFF bytes + * %0xFF bytes * UBI_IO_PEB_FREE: the physical eraseblock is free, i.e. it contains only a - * valid erase counter header, and the rest are %0xFF bytes + * valid erase counter header, and the rest are %0xFF bytes * UBI_IO_BAD_EC_HDR: the erase counter header is corrupted (bad magic or CRC) * UBI_IO_BAD_VID_HDR: the volume identifier header is corrupted (bad magic or - * CRC) + * CRC) * UBI_IO_BITFLIPS: bit-flips were detected and corrected */ enum { @@ -94,8 +100,68 @@ UBI_IO_BITFLIPS }; -extern int ubi_devices_cnt; -extern struct ubi_device *ubi_devices[]; +/** + * struct ubi_wl_entry - wear-leveling entry. + * @u.rb: link in the corresponding (free/used) RB-tree + * @u.list: link in the protection queue + * @ec: erase counter + * @pnum: physical eraseblock number + * + * This data structure is used in the WL sub-system. Each physical eraseblock + * has a corresponding &struct ubi_wl_entry object which may be kept in different + * RB-trees. See WL sub-system for details. + */ +struct ubi_wl_entry { + union { + struct rb_node rb; + struct list_head list; + } u; + int ec; + int pnum; +}; + +/** + * struct ubi_ltree_entry - an entry in the lock tree. + * @rb: links RB-tree nodes + * @vol_id: volume ID of the locked logical eraseblock + * @lnum: locked logical eraseblock number + * @users: how many tasks are using this logical eraseblock or wait for it + * @mutex: read/write mutex to implement read/write access serialization to + * the (@vol_id, @lnum) logical eraseblock + * + * This data structure is used in the EBA sub-system to implement per-LEB + * locking. When a logical eraseblock is being locked - corresponding + * &struct ubi_ltree_entry object is inserted to the lock tree (@ubi->ltree). + * See EBA sub-system for details. + */ +struct ubi_ltree_entry { + struct rb_node rb; + int vol_id; + int lnum; + int users; + struct rw_semaphore mutex; +}; + +/** + * struct ubi_rename_entry - volume re-name description data structure.
+ * @new_name_len: new volume name length + * @new_name: new volume name + * @remove: if not zero, this volume should be removed, not re-named + * @desc: descriptor of the volume + * @list: links re-name entries into a list + * + * This data structure is utilized in the multiple volume re-name code. Namely, + * UBI first creates a list of &struct ubi_rename_entry objects from the + * &struct ubi_rnvol_req request object, and then utilizes this list to do all + * the job. + */ +struct ubi_rename_entry { + int new_name_len; + char new_name[UBI_VOL_NAME_MAX + 1]; + int remove; + struct ubi_volume_desc *desc; + struct list_head list; +}; struct ubi_volume_desc; @@ -105,11 +171,10 @@ * @cdev: character device object to create character device * @ubi: reference to the UBI device description object * @vol_id: volume ID + * @ref_count: volume reference count * @readers: number of users holding this volume in read-only mode * @writers: number of users holding this volume in read-write mode * @exclusive: whether somebody holds this volume in exclusive mode - * @removed: if the volume was removed - * @checked: if this static volume was checked * * @reserved_pebs: how many physical eraseblocks are reserved for this volume * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) @@ -117,21 +182,31 @@ * @used_ebs: how many logical eraseblocks in this volume contain data * @last_eb_bytes: how many bytes are stored in the last logical eraseblock * @used_bytes: how many bytes of data this volume contains - * @upd_marker: non-zero if the update marker is set for this volume - * @corrupted: non-zero if the volume is corrupted (static volumes only) * @alignment: volume alignment * @data_pad: how many bytes are not used at the end of physical eraseblocks to - * satisfy the requested alignment + * satisfy the requested alignment * @name_len: volume name length * @name: volume name * - * @updating: whether the volume is being updated * @upd_ebs: how many eraseblocks are 
expected to be updated - * @upd_bytes: how many bytes are expected to be received - * @upd_received: how many update bytes were already received - * @upd_buf: update buffer which is used to collect update data + * @ch_lnum: LEB number which is being changed by the atomic LEB change + * operation + * @ch_dtype: data persistency type which is being changed by the atomic LEB + * change operation + * @upd_bytes: how many bytes are expected to be received for volume update or + * atomic LEB change + * @upd_received: how many bytes were already received for volume update or + * atomic LEB change + * @upd_buf: update buffer which is used to collect update data or data for + * atomic LEB change * * @eba_tbl: EBA table of this volume (LEB->PEB mapping) + * @checked: %1 if this static volume was checked + * @corrupted: %1 if the volume is corrupted (static volumes only) + * @upd_marker: %1 if the update marker is set for this volume + * @updating: %1 if the volume is being updated + * @changing_leb: %1 if the atomic LEB change ioctl command is in progress + * @direct_writes: %1 if direct writes are enabled for this volume * * @gluebi_desc: gluebi UBI volume descriptor * @gluebi_refcount: reference count of the gluebi MTD device @@ -150,11 +225,10 @@ struct cdev cdev; struct ubi_device *ubi; int vol_id; + int ref_count; int readers; int writers; int exclusive; - int removed; - int checked; int reserved_pebs; int vol_type; @@ -162,23 +236,32 @@ int used_ebs; int last_eb_bytes; long long used_bytes; - int upd_marker; - int corrupted; int alignment; int data_pad; int name_len; - char name[UBI_VOL_NAME_MAX+1]; + char name[UBI_VOL_NAME_MAX + 1]; - int updating; int upd_ebs; + int ch_lnum; + int ch_dtype; long long upd_bytes; long long upd_received; void *upd_buf; int *eba_tbl; + unsigned int checked:1; + unsigned int corrupted:1; + unsigned int upd_marker:1; + unsigned int updating:1; + unsigned int changing_leb:1; + unsigned int direct_writes:1; #ifdef CONFIG_MTD_UBI_GLUEBI -
/* Gluebi-related stuff may be compiled out */ + /* + * Gluebi-related stuff may be compiled out. + * Note: this should not be built into UBI but should be a separate + * ubimtd driver which works on top of UBI and emulates MTD devices. + */ struct ubi_volume_desc *gluebi_desc; int gluebi_refcount; struct mtd_info gluebi_mtd; @@ -186,8 +269,7 @@ }; /** - * struct ubi_volume_desc - descriptor of the UBI volume returned when it is - * opened. + * struct ubi_volume_desc - UBI volume descriptor returned when it is opened. * @vol: reference to the corresponding volume description object * @mode: open mode (%UBI_READONLY, %UBI_READWRITE, or %UBI_EXCLUSIVE) */ @@ -200,28 +282,32 @@ /** * struct ubi_device - UBI device description structure - * @dev: class device object to use the the Linux device model + * @dev: UBI device object to use the the Linux device model * @cdev: character device object to create character device * @ubi_num: UBI device number * @ubi_name: UBI device name - * @major: character device major number * @vol_count: number of volumes in this UBI device * @volumes: volumes of this UBI device * @volumes_lock: protects @volumes, @rsvd_pebs, @avail_pebs, beb_rsvd_pebs, - * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, @vol->readers, - * @vol->writers, @vol->exclusive, @vol->removed, @vol->mapping and - * @vol->eba_tbl. + * @beb_rsvd_level, @bad_peb_count, @good_peb_count, @vol_count, + * @vol->readers, @vol->writers, @vol->exclusive, + * @vol->ref_count, @vol->mapping and @vol->eba_tbl. 
+ * @ref_count: count of references on the UBI device * * @rsvd_pebs: count of reserved physical eraseblocks * @avail_pebs: count of available physical eraseblocks * @beb_rsvd_pebs: how many physical eraseblocks are reserved for bad PEB - * handling + * handling * @beb_rsvd_level: normal level of PEBs reserved for bad PEB handling * + * @autoresize_vol_id: ID of the volume which has to be auto-resized at the end + * of UBI initialization * @vtbl_slots: how many slots are available in the volume table * @vtbl_size: size of the volume table in bytes * @vtbl: in-RAM volume table copy - * @vtbl_mutex: protects on-flash volume table + * @volumes_mutex: protects on-flash volume table and serializes volume + * changes, like creation, deletion, update, re-size, + * re-name and set property * * @max_ec: current highest erase counter value * @mean_ec: current mean erase counter value @@ -234,19 +320,19 @@ * @used: RB-tree of used physical eraseblocks * @free: RB-tree of free physical eraseblocks * @scrub: RB-tree of physical eraseblocks which need scrubbing - * @prot: protection trees - * @prot.pnum: protection tree indexed by physical eraseblock numbers - * @prot.aec: protection tree indexed by absolute erase counter value - * @wl_lock: protects the @used, @free, @prot, @lookuptbl, @abs_ec, @move_from, - * @move_to, @move_to_put @erase_pending, @wl_scheduled, and @works - * fields + * @pq: protection queue (contains physical eraseblocks which are temporarily + * protected from the wear-leveling worker) + * @pq_head: protection queue head + * @wl_lock: protects the @used, @free, @pq, @pq_head, @lookuptbl, @move_from, + * @move_to, @move_to_put, @erase_pending, @wl_scheduled and @works + * fields + * @move_mutex: serializes eraseblock moves + * @work_sem: synchronizes the WL worker with user tasks * @wl_scheduled: non-zero if the wear-leveling was scheduled * @lookuptbl: a table to quickly find a &struct ubi_wl_entry object for any - * physical eraseblock - * @abs_ec: absolute
erase counter + * physical eraseblock * @move_from: physical eraseblock from where the data is being moved * @move_to: physical eraseblock where the data is being moved to - * @move_from_put: if the "from" PEB was put * @move_to_put: if the "to" PEB was put * @works: list of pending works * @works_count: count of pending works @@ -264,68 +350,70 @@ * @ro_mode: if the UBI device is in read-only mode * @leb_size: logical eraseblock size * @leb_start: starting offset of logical eraseblocks within physical - * eraseblocks + * eraseblocks * @ec_hdr_alsize: size of the EC header aligned to @hdrs_min_io_size * @vid_hdr_alsize: size of the VID header aligned to @hdrs_min_io_size * @vid_hdr_offset: starting offset of the volume identifier header (might be - * unaligned) + * unaligned) * @vid_hdr_aloffset: starting offset of the VID header aligned to * @hdrs_min_io_size * @vid_hdr_shift: contains @vid_hdr_offset - @vid_hdr_aloffset * @bad_allowed: whether the MTD device admits of bad physical eraseblocks or - * not + * not * @mtd: MTD device descriptor * * @peb_buf1: a buffer of PEB size used for different purposes * @peb_buf2: another buffer of PEB size used for different purposes - * @buf_mutex: proptects @peb_buf1 and @peb_buf2 - * @dbg_peb_buf: buffer of PEB size used for debugging - * @dbg_buf_mutex: proptects @dbg_peb_buf + * @buf_mutex: protects @peb_buf1 and @peb_buf2 + * @ckvol_mutex: serializes static volume checking when opening + * @mult_mutex: serializes operations on multiple volumes, like re-naming + * @dbg_peb_buf: buffer of PEB size used for debugging + * @dbg_buf_mutex: protects @dbg_peb_buf */ struct ubi_device { struct cdev cdev; struct device dev; int ubi_num; char ubi_name[sizeof(UBI_NAME_STR)+5]; - int major; int vol_count; struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT]; spinlock_t volumes_lock; + int ref_count; int rsvd_pebs; int avail_pebs; int beb_rsvd_pebs; int beb_rsvd_level; + int autoresize_vol_id; int vtbl_slots; int vtbl_size; 
struct ubi_vtbl_record *vtbl; - struct mutex vtbl_mutex; + struct mutex volumes_mutex; int max_ec; + /* Note, mean_ec is not updated run-time - should be fixed */ int mean_ec; - /* EBA unit's stuff */ + /* EBA sub-system's stuff */ unsigned long long global_sqnum; spinlock_t ltree_lock; struct rb_root ltree; struct mutex alc_mutex; - /* Wear-leveling unit's stuff */ + /* Wear-leveling sub-system's stuff */ struct rb_root used; struct rb_root free; struct rb_root scrub; - struct { - struct rb_root pnum; - struct rb_root aec; - } prot; + struct list_head pq[UBI_PROT_QUEUE_LEN]; + int pq_head; spinlock_t wl_lock; + struct mutex move_mutex; + struct rw_semaphore work_sem; int wl_scheduled; struct ubi_wl_entry **lookuptbl; - unsigned long long abs_ec; struct ubi_wl_entry *move_from; struct ubi_wl_entry *move_to; - int move_from_put; int move_to_put; struct list_head works; int works_count; @@ -333,7 +421,7 @@ int thread_enabled; char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2]; - /* I/O unit's stuff */ + /* I/O sub-system's stuff */ long long flash_size; int peb_count; int peb_size; @@ -355,35 +443,49 @@ void *peb_buf1; void *peb_buf2; struct mutex buf_mutex; + struct mutex ckvol_mutex; + struct mutex mult_mutex; #ifdef CONFIG_MTD_UBI_DEBUG void *dbg_peb_buf; struct mutex dbg_buf_mutex; #endif }; -extern struct file_operations ubi_cdev_operations; -extern struct file_operations ubi_vol_cdev_operations; +extern struct kmem_cache *ubi_wl_entry_slab; +extern const struct file_operations ubi_ctrl_cdev_operations; +extern const struct file_operations ubi_cdev_operations; +extern const struct file_operations ubi_vol_cdev_operations; extern struct class *ubi_class; +extern struct mutex ubi_devices_mutex; /* vtbl.c */ int ubi_change_vtbl_record(struct ubi_device *ubi, int idx, struct ubi_vtbl_record *vtbl_rec); +int ubi_vtbl_rename_volumes(struct ubi_device *ubi, + struct list_head *rename_list); int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si); /* vmt.c 
*/ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req); -int ubi_remove_volume(struct ubi_volume_desc *desc); +int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl); int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs); -int ubi_add_volume(struct ubi_device *ubi, int vol_id); -void ubi_free_volume(struct ubi_device *ubi, int vol_id); +int ubi_rename_volumes(struct ubi_device *ubi, struct list_head *rename_list); +int ubi_add_volume(struct ubi_device *ubi, struct ubi_volume *vol); +void ubi_free_volume(struct ubi_device *ubi, struct ubi_volume *vol); /* upd.c */ -int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes); -int ubi_more_update_data(struct ubi_device *ubi, int vol_id, +int ubi_start_update(struct ubi_device *ubi, struct ubi_volume *vol, + long long bytes); +int ubi_more_update_data(struct ubi_device *ubi, struct ubi_volume *vol, const void __user *buf, int count); +int ubi_start_leb_change(struct ubi_device *ubi, struct ubi_volume *vol, + const struct ubi_leb_change_req *req); +int ubi_more_leb_change_data(struct ubi_device *ubi, struct ubi_volume *vol, + const void __user *buf, int count); /* misc.c */ -int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length); +int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, + int length); int ubi_check_volume(struct ubi_device *ubi, int vol_id); void ubi_calculate_reserved(struct ubi_device *ubi); @@ -399,20 +501,20 @@ #endif /* eba.c */ -int ubi_eba_unmap_leb(struct ubi_device *ubi, int vol_id, int lnum); -int ubi_eba_read_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, - int offset, int len, int check); -int ubi_eba_write_leb(struct ubi_device *ubi, int vol_id, int lnum, +int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum); +int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, + void *buf, int offset, int len, int check); +int 
ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, const void *buf, int offset, int len, int dtype); -int ubi_eba_write_leb_st(struct ubi_device *ubi, int vol_id, int lnum, - const void *buf, int len, int dtype, +int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum, const void *buf, int len, int dtype, int used_ebs); -int ubi_eba_atomic_leb_change(struct ubi_device *ubi, int vol_id, int lnum, - const void *buf, int len, int dtype); +int ubi_eba_atomic_leb_change(struct ubi_device *ubi, struct ubi_volume *vol, + int lnum, const void *buf, int len, int dtype); int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to, struct ubi_vid_hdr *vid_hdr); int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); -void ubi_eba_close(const struct ubi_device *ubi); /* wl.c */ int ubi_wl_get_peb(struct ubi_device *ubi, int dtype); @@ -421,6 +523,7 @@ int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum); int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si); void ubi_wl_close(struct ubi_device *ubi); +int ubi_thread(void *u); /* io.c */ int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset, @@ -439,6 +542,14 @@ int ubi_io_write_vid_hdr(struct ubi_device *ubi, int pnum, struct ubi_vid_hdr *vid_hdr); +/* build.c */ +int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset); +int ubi_detach_mtd_dev(int ubi_num, int anyway); +struct ubi_device *ubi_get_device(int ubi_num); +void ubi_put_device(struct ubi_device *ubi); +struct ubi_device *ubi_get_by_major(int major); +int ubi_major2num(int major); + /* * ubi_rb_for_each_entry - walk an RB-tree. 
* @rb: a pointer to type 'struct rb_node' to to use as a loop counter @@ -523,8 +634,10 @@ */ static inline void ubi_ro_mode(struct ubi_device *ubi) { - ubi->ro_mode = 1; - ubi_warn("switch to read-only mode"); + if (!ubi->ro_mode) { + ubi->ro_mode = 1; + ubi_warn("switch to read-only mode"); + } } /** diff -Nurd linux-2.6.24/drivers/mtd/ubi/upd.c ubifs-v2.6.24/drivers/mtd/ubi/upd.c --- linux-2.6.24/drivers/mtd/ubi/upd.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/upd.c 2009-04-07 17:14:47.000000000 +0200 @@ -22,7 +22,8 @@ */ /* - * This file contains implementation of the volume update functionality. + * This file contains implementation of the volume update and atomic LEB change + * functionality. * * The update operation is based on the per-volume update marker which is * stored in the volume table. The update marker is set before the update @@ -38,36 +39,37 @@ */ #include -#include -#include +#include #include "ubi.h" /** * set_update_marker - set update marker. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * - * This function sets the update marker flag for volume @vol_id. Returns zero + * This function sets the update marker flag for volume @vol. Returns zero * in case of success and a negative error code in case of failure. 
*/ -static int set_update_marker(struct ubi_device *ubi, int vol_id) +static int set_update_marker(struct ubi_device *ubi, struct ubi_volume *vol) { int err; struct ubi_vtbl_record vtbl_rec; - struct ubi_volume *vol = ubi->volumes[vol_id]; - dbg_msg("set update marker for volume %d", vol_id); + dbg_gen("set update marker for volume %d", vol->vol_id); if (vol->upd_marker) { - ubi_assert(ubi->vtbl[vol_id].upd_marker); - dbg_msg("already set"); + ubi_assert(ubi->vtbl[vol->vol_id].upd_marker); + dbg_gen("already set"); return 0; } - memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); + memcpy(&vtbl_rec, &ubi->vtbl[vol->vol_id], + sizeof(struct ubi_vtbl_record)); vtbl_rec.upd_marker = 1; - err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + mutex_lock(&ubi->volumes_mutex); + err = ubi_change_vtbl_record(ubi, vol->vol_id, &vtbl_rec); + mutex_unlock(&ubi->volumes_mutex); vol->upd_marker = 1; return err; } @@ -75,38 +77,40 @@ /** * clear_update_marker - clear update marker. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @bytes: new data size in bytes * - * This function clears the update marker for volume @vol_id, sets new volume + * This function clears the update marker for volume @vol, sets new volume * data size and clears the "corrupted" flag (static volumes only). Returns * zero in case of success and a negative error code in case of failure. 
*/ -static int clear_update_marker(struct ubi_device *ubi, int vol_id, long long bytes) +static int clear_update_marker(struct ubi_device *ubi, struct ubi_volume *vol, + long long bytes) { int err; - uint64_t tmp; struct ubi_vtbl_record vtbl_rec; - struct ubi_volume *vol = ubi->volumes[vol_id]; - dbg_msg("clear update marker for volume %d", vol_id); + dbg_gen("clear update marker for volume %d", vol->vol_id); - memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record)); + memcpy(&vtbl_rec, &ubi->vtbl[vol->vol_id], + sizeof(struct ubi_vtbl_record)); ubi_assert(vol->upd_marker && vtbl_rec.upd_marker); vtbl_rec.upd_marker = 0; if (vol->vol_type == UBI_STATIC_VOLUME) { vol->corrupted = 0; - vol->used_bytes = tmp = bytes; - vol->last_eb_bytes = do_div(tmp, vol->usable_leb_size); - vol->used_ebs = tmp; + vol->used_bytes = bytes; + vol->used_ebs = div_u64_rem(bytes, vol->usable_leb_size, + &vol->last_eb_bytes); if (vol->last_eb_bytes) vol->used_ebs += 1; else vol->last_eb_bytes = vol->usable_leb_size; } - err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); + mutex_lock(&ubi->volumes_mutex); + err = ubi_change_vtbl_record(ubi, vol->vol_id, &vtbl_rec); + mutex_unlock(&ubi->volumes_mutex); vol->upd_marker = 0; return err; } @@ -114,35 +118,35 @@ /** * ubi_start_update - start volume update. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @bytes: update bytes * * This function starts volume update operation. If @bytes is zero, the volume * is just wiped out. Returns zero in case of success and a negative error code * in case of failure. 
*/ -int ubi_start_update(struct ubi_device *ubi, int vol_id, long long bytes) +int ubi_start_update(struct ubi_device *ubi, struct ubi_volume *vol, + long long bytes) { int i, err; - uint64_t tmp; - struct ubi_volume *vol = ubi->volumes[vol_id]; - dbg_msg("start update of volume %d, %llu bytes", vol_id, bytes); + dbg_gen("start update of volume %d, %llu bytes", vol->vol_id, bytes); + ubi_assert(!vol->updating && !vol->changing_leb); vol->updating = 1; - err = set_update_marker(ubi, vol_id); + err = set_update_marker(ubi, vol); if (err) return err; /* Before updating - wipe out the volume */ for (i = 0; i < vol->reserved_pebs; i++) { - err = ubi_eba_unmap_leb(ubi, vol_id, i); + err = ubi_eba_unmap_leb(ubi, vol, i); if (err) return err; } if (bytes == 0) { - err = clear_update_marker(ubi, vol_id, 0); + err = clear_update_marker(ubi, vol, 0); if (err) return err; err = ubi_wl_flush(ubi); @@ -154,18 +158,50 @@ if (!vol->upd_buf) return -ENOMEM; - tmp = bytes; - vol->upd_ebs = !!do_div(tmp, vol->usable_leb_size); - vol->upd_ebs += tmp; + vol->upd_ebs = div_u64(bytes + vol->usable_leb_size - 1, + vol->usable_leb_size); vol->upd_bytes = bytes; vol->upd_received = 0; return 0; } /** + * ubi_start_leb_change - start atomic LEB change. + * @ubi: UBI device description object + * @vol: volume description object + * @req: operation request + * + * This function starts atomic LEB change operation. Returns zero in case of + * success and a negative error code in case of failure. 
+ */ +int ubi_start_leb_change(struct ubi_device *ubi, struct ubi_volume *vol, + const struct ubi_leb_change_req *req) +{ + ubi_assert(!vol->updating && !vol->changing_leb); + + dbg_gen("start changing LEB %d:%d, %u bytes", + vol->vol_id, req->lnum, req->bytes); + if (req->bytes == 0) + return ubi_eba_atomic_leb_change(ubi, vol, req->lnum, NULL, 0, + req->dtype); + + vol->upd_bytes = req->bytes; + vol->upd_received = 0; + vol->changing_leb = 1; + vol->ch_lnum = req->lnum; + vol->ch_dtype = req->dtype; + + vol->upd_buf = vmalloc(req->bytes); + if (!vol->upd_buf) + return -ENOMEM; + + return 0; +} + +/** * write_leb - write update data. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * @lnum: logical eraseblock number * @buf: data to write * @len: data size @@ -191,25 +227,22 @@ * This function returns zero in case of success and a negative error code in * case of failure. */ -static int write_leb(struct ubi_device *ubi, int vol_id, int lnum, void *buf, - int len, int used_ebs) +static int write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum, + void *buf, int len, int used_ebs) { - int err, l; - struct ubi_volume *vol = ubi->volumes[vol_id]; + int err; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { - l = ALIGN(len, ubi->min_io_size); - memset(buf + len, 0xFF, l - len); + int l = ALIGN(len, ubi->min_io_size); - l = ubi_calc_data_len(ubi, buf, l); - if (l == 0) { - dbg_msg("all %d bytes contain 0xFF - skip", len); + memset(buf + len, 0xFF, l - len); + len = ubi_calc_data_len(ubi, buf, l); + if (len == 0) { + dbg_gen("all %d bytes contain 0xFF - skip", len); return 0; } - if (len != l) - dbg_msg("skip last %d bytes (0xFF)", len - l); - err = ubi_eba_write_leb(ubi, vol_id, lnum, buf, 0, l, + err = ubi_eba_write_leb(ubi, vol, lnum, buf, 0, len, UBI_UNKNOWN); } else { /* @@ -222,7 +255,7 @@ * contain zeros, not random trash. 
*/ memset(buf + len, 0, vol->usable_leb_size - len); - err = ubi_eba_write_leb_st(ubi, vol_id, lnum, buf, len, + err = ubi_eba_write_leb_st(ubi, vol, lnum, buf, len, UBI_UNKNOWN, used_ebs); } @@ -231,33 +264,29 @@ /** * ubi_more_update_data - write more update data. + * @ubi: UBI device description object * @vol: volume description object * @buf: write data (user-space memory buffer) * @count: how much bytes to write * * This function writes more data to the volume which is being updated. It may - * be called arbitrary number of times until all of the update data arrive. - * This function returns %0 in case of success, number of bytes written during - * the last call if the whole volume update was successfully finished, and a + * be called arbitrary number of times until all the update data arrives. This + * function returns %0 in case of success, number of bytes written during the + * last call if the whole volume update has been successfully finished, and a * negative error code in case of failure. */ -int ubi_more_update_data(struct ubi_device *ubi, int vol_id, +int ubi_more_update_data(struct ubi_device *ubi, struct ubi_volume *vol, const void __user *buf, int count) { - uint64_t tmp; - struct ubi_volume *vol = ubi->volumes[vol_id]; int lnum, offs, err = 0, len, to_write = count; - dbg_msg("write %d of %lld bytes, %lld already passed", + dbg_gen("write %d of %lld bytes, %lld already passed", count, vol->upd_bytes, vol->upd_received); if (ubi->ro_mode) return -EROFS; - tmp = vol->upd_received; - offs = do_div(tmp, vol->usable_leb_size); - lnum = tmp; - + lnum = div_u64_rem(vol->upd_received, vol->usable_leb_size, &offs); if (vol->upd_received + count > vol->upd_bytes) to_write = count = vol->upd_bytes - vol->upd_received; @@ -290,8 +319,8 @@ * is the last chunk, it's time to flush the buffer.
*/ ubi_assert(flush_len <= vol->usable_leb_size); - err = write_leb(ubi, vol_id, lnum, vol->upd_buf, - flush_len, vol->upd_ebs); + err = write_leb(ubi, vol, lnum, vol->upd_buf, flush_len, + vol->upd_ebs); if (err) return err; } @@ -318,8 +347,8 @@ if (len == vol->usable_leb_size || vol->upd_received + len == vol->upd_bytes) { - err = write_leb(ubi, vol_id, lnum, vol->upd_buf, len, - vol->upd_ebs); + err = write_leb(ubi, vol, lnum, vol->upd_buf, + len, vol->upd_ebs); if (err) break; } @@ -333,16 +362,72 @@ ubi_assert(vol->upd_received <= vol->upd_bytes); if (vol->upd_received == vol->upd_bytes) { /* The update is finished, clear the update marker */ - err = clear_update_marker(ubi, vol_id, vol->upd_bytes); + err = clear_update_marker(ubi, vol, vol->upd_bytes); if (err) return err; err = ubi_wl_flush(ubi); if (err == 0) { + vol->updating = 0; err = to_write; vfree(vol->upd_buf); - vol->updating = 0; } } return err; } + +/** + * ubi_more_leb_change_data - accept more data for atomic LEB change. + * @ubi: UBI device description object + * @vol: volume description object + * @buf: write data (user-space memory buffer) + * @count: how much bytes to write + * + * This function accepts more data to the volume which is being under the + * "atomic LEB change" operation. It may be called arbitrary number of times + * until all data arrives. This function returns %0 in case of success, number + * of bytes written during the last call if the whole "atomic LEB change" + * operation has been successfully finished, and a negative error code in case + * of failure. 
+ */ +int ubi_more_leb_change_data(struct ubi_device *ubi, struct ubi_volume *vol, + const void __user *buf, int count) +{ + int err; + + dbg_gen("write %d of %lld bytes, %lld already passed", + count, vol->upd_bytes, vol->upd_received); + + if (ubi->ro_mode) + return -EROFS; + + if (vol->upd_received + count > vol->upd_bytes) + count = vol->upd_bytes - vol->upd_received; + + err = copy_from_user(vol->upd_buf + vol->upd_received, buf, count); + if (err) + return -EFAULT; + + vol->upd_received += count; + + if (vol->upd_received == vol->upd_bytes) { + int len = ALIGN((int)vol->upd_bytes, ubi->min_io_size); + + memset(vol->upd_buf + vol->upd_bytes, 0xFF, + len - vol->upd_bytes); + len = ubi_calc_data_len(ubi, vol->upd_buf, len); + err = ubi_eba_atomic_leb_change(ubi, vol, vol->ch_lnum, + vol->upd_buf, len, UBI_UNKNOWN); + if (err) + return err; + } + + ubi_assert(vol->upd_received <= vol->upd_bytes); + if (vol->upd_received == vol->upd_bytes) { + vol->changing_leb = 0; + err = count; + vfree(vol->upd_buf); + } + + return err; +} diff -Nurd linux-2.6.24/drivers/mtd/ubi/vmt.c ubifs-v2.6.24/drivers/mtd/ubi/vmt.c --- linux-2.6.24/drivers/mtd/ubi/vmt.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/vmt.c 2009-04-07 17:14:47.000000000 +0200 @@ -24,13 +24,12 @@ */ #include -#include #include "ubi.h" #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID -static void paranoid_check_volumes(struct ubi_device *ubi); +static int paranoid_check_volumes(struct ubi_device *ubi); #else -#define paranoid_check_volumes(ubi) +#define paranoid_check_volumes(ubi) 0 #endif static ssize_t vol_attribute_show(struct device *dev, @@ -63,21 +62,30 @@ * B. process 2 removes volume Y; * C. process 1 starts reading the //class/ubi/ubiX_Y/reserved_ebs file; * - * What we want to do in a situation like that is to return error when the file - * is read. This is done by means of the 'removed' flag and the 'vol_lock' of - * the UBI volume description object. 
+ * In this situation, this function will return %-ENODEV because it will find + * out that the volume was removed from the @ubi->volumes array. */ static ssize_t vol_attribute_show(struct device *dev, struct device_attribute *attr, char *buf) { int ret; struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); + struct ubi_device *ubi; - spin_lock(&vol->ubi->volumes_lock); - if (vol->removed) { - spin_unlock(&vol->ubi->volumes_lock); + ubi = ubi_get_device(vol->ubi->ubi_num); + if (!ubi) + return -ENODEV; + + spin_lock(&ubi->volumes_lock); + if (!ubi->volumes[vol->vol_id]) { + spin_unlock(&ubi->volumes_lock); + ubi_put_device(ubi); return -ENODEV; } + /* Take a reference to prevent volume removal */ + vol->ref_count += 1; + spin_unlock(&ubi->volumes_lock); + if (attr == &attr_vol_reserved_ebs) ret = sprintf(buf, "%d\n", vol->reserved_pebs); else if (attr == &attr_vol_type) { @@ -94,15 +102,22 @@ ret = sprintf(buf, "%d\n", vol->corrupted); else if (attr == &attr_vol_alignment) ret = sprintf(buf, "%d\n", vol->alignment); - else if (attr == &attr_vol_usable_eb_size) { + else if (attr == &attr_vol_usable_eb_size) ret = sprintf(buf, "%d\n", vol->usable_leb_size); - } else if (attr == &attr_vol_data_bytes) + else if (attr == &attr_vol_data_bytes) ret = sprintf(buf, "%lld\n", vol->used_bytes); else if (attr == &attr_vol_upd_marker) ret = sprintf(buf, "%d\n", vol->upd_marker); else - BUG(); - spin_unlock(&vol->ubi->volumes_lock); + /* This must be a bug */ + ret = -EINVAL; + + /* We've done the operation, drop volume and UBI device references */ + spin_lock(&ubi->volumes_lock); + vol->ref_count -= 1; + ubi_assert(vol->ref_count >= 0); + spin_unlock(&ubi->volumes_lock); + ubi_put_device(ubi); return ret; } @@ -110,7 +125,8 @@ static void vol_release(struct device *dev) { struct ubi_volume *vol = container_of(dev, struct ubi_volume, dev); - ubi_assert(vol->removed); + + kfree(vol->eba_tbl); kfree(vol); } @@ -152,9 +168,7 @@ if (err) return err; err = 
device_create_file(&vol->dev, &attr_vol_upd_marker); - if (err) - return err; - return 0; + return err; } /** @@ -180,16 +194,17 @@ * @req: volume creation request * * This function creates volume described by @req. If @req->vol_id id - * %UBI_VOL_NUM_AUTO, this function automatically assigne ID to the new volume + * %UBI_VOL_NUM_AUTO, this function automatically assigns ID to the new volume * and saves it in @req->vol_id. Returns zero in case of success and a negative - * error code in case of failure. + * error code in case of failure. Note, the caller has to have the + * @ubi->volumes_mutex locked. */ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) { - int i, err, vol_id = req->vol_id; + int i, err, vol_id = req->vol_id, do_free = 1; struct ubi_volume *vol; struct ubi_vtbl_record vtbl_rec; - uint64_t bytes; + dev_t dev; if (ubi->ro_mode) return -EROFS; @@ -199,10 +214,9 @@ return -ENOMEM; spin_lock(&ubi->volumes_lock); - if (vol_id == UBI_VOL_NUM_AUTO) { /* Find unused volume ID */ - dbg_msg("search for vacant volume ID"); + dbg_gen("search for vacant volume ID"); for (i = 0; i < ubi->vtbl_slots; i++) if (!ubi->volumes[i]) { vol_id = i; @@ -217,7 +231,7 @@ req->vol_id = vol_id; } - dbg_msg("volume ID %d, %llu bytes, type %d, name %s", + dbg_gen("volume ID %d, %llu bytes, type %d, name %s", vol_id, (unsigned long long)req->bytes, (int)req->vol_type, req->name); @@ -237,12 +251,10 @@ goto out_unlock; } - /* Calculate how many eraseblocks are requested */ + /* Calculate how many eraseblocks are requested */ vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment; - bytes = req->bytes; - if (do_div(bytes, vol->usable_leb_size)) - vol->reserved_pebs = 1; - vol->reserved_pebs += bytes; + vol->reserved_pebs += div_u64(req->bytes + vol->usable_leb_size - 1, + vol->usable_leb_size); /* Reserve physical eraseblocks */ if (vol->reserved_pebs > ubi->avail_pebs) { @@ -252,17 +264,15 @@ } ubi->avail_pebs -= vol->reserved_pebs;
ubi->rsvd_pebs += vol->reserved_pebs; + spin_unlock(&ubi->volumes_lock); vol->vol_id = vol_id; vol->alignment = req->alignment; vol->data_pad = ubi->leb_size % vol->alignment; vol->vol_type = req->vol_type; vol->name_len = req->name_len; - memcpy(vol->name, req->name, vol->name_len + 1); - vol->exclusive = 1; + memcpy(vol->name, req->name, vol->name_len); vol->ubi = ubi; - ubi->volumes[vol_id] = vol; - spin_unlock(&ubi->volumes_lock); /* * Finish all pending erases because there may be some LEBs belonging @@ -287,10 +297,10 @@ vol->used_bytes = (long long)vol->used_ebs * vol->usable_leb_size; } else { - bytes = vol->used_bytes; - vol->last_eb_bytes = do_div(bytes, vol->usable_leb_size); - vol->used_ebs = bytes; - if (vol->last_eb_bytes) + vol->used_ebs = div_u64_rem(vol->used_bytes, + vol->usable_leb_size, + &vol->last_eb_bytes); + if (vol->last_eb_bytes != 0) vol->used_ebs += 1; else vol->last_eb_bytes = vol->usable_leb_size; @@ -299,9 +309,10 @@ /* Register character device for the volume */ cdev_init(&vol->cdev, &ubi_vol_cdev_operations); vol->cdev.owner = THIS_MODULE; - err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol_id + 1), 1); + dev = MKDEV(MAJOR(ubi->cdev.dev), vol_id + 1); + err = cdev_add(&vol->cdev, dev, 1); if (err) { - ubi_err("cannot add character device for volume %d", vol_id); + ubi_err("cannot add character device"); goto out_mapping; } @@ -311,12 +322,15 @@ vol->dev.release = vol_release; vol->dev.parent = &ubi->dev; - vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); + vol->dev.devt = dev; vol->dev.class = ubi_class; + sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); err = device_register(&vol->dev); - if (err) + if (err) { + ubi_err("cannot register device"); goto out_gluebi; + } err = volume_sysfs_init(ubi, vol); if (err) @@ -332,98 +346,108 @@ vtbl_rec.vol_type = UBI_VID_DYNAMIC; else vtbl_rec.vol_type = UBI_VID_STATIC; - memcpy(vtbl_rec.name, vol->name, vol->name_len + 1); + memcpy(vtbl_rec.name, vol->name, 
vol->name_len); err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec); if (err) goto out_sysfs; spin_lock(&ubi->volumes_lock); + ubi->volumes[vol_id] = vol; ubi->vol_count += 1; - vol->exclusive = 0; spin_unlock(&ubi->volumes_lock); - paranoid_check_volumes(ubi); - return 0; + err = paranoid_check_volumes(ubi); + return err; +out_sysfs: + /* + * We have registered our device, we should not free the volume + * description object in this function in case of an error - it is + * freed by the release function. + * + * Get device reference to prevent the release function from being + * called just after sysfs has been closed. + */ + do_free = 0; + get_device(&vol->dev); + volume_sysfs_close(vol); out_gluebi: - err = ubi_destroy_gluebi(vol); + if (ubi_destroy_gluebi(vol)) + dbg_err("cannot destroy gluebi for volume %d:%d", + ubi->ubi_num, vol_id); out_cdev: cdev_del(&vol->cdev); out_mapping: - kfree(vol->eba_tbl); + if (do_free) + kfree(vol->eba_tbl); out_acc: spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= vol->reserved_pebs; ubi->avail_pebs += vol->reserved_pebs; - ubi->volumes[vol_id] = NULL; out_unlock: spin_unlock(&ubi->volumes_lock); - kfree(vol); - return err; - - /* - * We are registered, so @vol is destroyed in the release function and - * we have to de-initialize differently. - */ -out_sysfs: - err = ubi_destroy_gluebi(vol); - cdev_del(&vol->cdev); - kfree(vol->eba_tbl); - spin_lock(&ubi->volumes_lock); - ubi->rsvd_pebs -= vol->reserved_pebs; - ubi->avail_pebs += vol->reserved_pebs; - ubi->volumes[vol_id] = NULL; - spin_unlock(&ubi->volumes_lock); - volume_sysfs_close(vol); + if (do_free) + kfree(vol); + else + put_device(&vol->dev); + ubi_err("cannot create volume %d, error %d", vol_id, err); return err; } /** * ubi_remove_volume - remove volume. * @desc: volume descriptor + * @no_vtbl: do not change volume table if not zero * * This function removes volume described by @desc. The volume has to be opened * in "exclusive" mode. 
Returns zero in case of success and a negative error - * code in case of failure. + * code in case of failure. The caller has to have the @ubi->volumes_mutex + * locked. */ -int ubi_remove_volume(struct ubi_volume_desc *desc) +int ubi_remove_volume(struct ubi_volume_desc *desc, int no_vtbl) { struct ubi_volume *vol = desc->vol; struct ubi_device *ubi = vol->ubi; int i, err, vol_id = vol->vol_id, reserved_pebs = vol->reserved_pebs; - dbg_msg("remove UBI volume %d", vol_id); + dbg_gen("remove UBI volume %d", vol_id); ubi_assert(desc->mode == UBI_EXCLUSIVE); ubi_assert(vol == ubi->volumes[vol_id]); if (ubi->ro_mode) return -EROFS; + spin_lock(&ubi->volumes_lock); + if (vol->ref_count > 1) { + /* + * The volume is busy, probably someone is reading one of its + * sysfs files. + */ + err = -EBUSY; + goto out_unlock; + } + ubi->volumes[vol_id] = NULL; + spin_unlock(&ubi->volumes_lock); + err = ubi_destroy_gluebi(vol); if (err) - return err; + goto out_err; - err = ubi_change_vtbl_record(ubi, vol_id, NULL); - if (err) - return err; + if (!no_vtbl) { + err = ubi_change_vtbl_record(ubi, vol_id, NULL); + if (err) + goto out_err; + } for (i = 0; i < vol->reserved_pebs; i++) { - err = ubi_eba_unmap_leb(ubi, vol_id, i); + err = ubi_eba_unmap_leb(ubi, vol, i); if (err) - return err; + goto out_err; } - spin_lock(&ubi->volumes_lock); - vol->removed = 1; - ubi->volumes[vol_id] = NULL; - spin_unlock(&ubi->volumes_lock); - - kfree(vol->eba_tbl); - vol->eba_tbl = NULL; cdev_del(&vol->cdev); volume_sysfs_close(vol); - kfree(desc); spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= reserved_pebs; @@ -440,9 +464,17 @@ ubi->vol_count -= 1; spin_unlock(&ubi->volumes_lock); - paranoid_check_volumes(ubi); - module_put(THIS_MODULE); - return 0; + if (!no_vtbl) + err = paranoid_check_volumes(ubi); + return err; + +out_err: + ubi_err("cannot remove volume %d, error %d", vol_id, err); + spin_lock(&ubi->volumes_lock); + ubi->volumes[vol_id] = vol; +out_unlock: + spin_unlock(&ubi->volumes_lock); + 
return err; } /** @@ -450,8 +482,9 @@ * @desc: volume descriptor * @reserved_pebs: new size in physical eraseblocks * - * This function returns zero in case of success, and a negative error code in - * case of failure. + * This function re-sizes the volume and returns zero in case of success, and a + * negative error code in case of failure. The caller has to have the + * @ubi->volumes_mutex locked. */ int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs) { @@ -464,10 +497,8 @@ if (ubi->ro_mode) return -EROFS; - dbg_msg("re-size volume %d to from %d to %d PEBs", + dbg_gen("re-size volume %d to from %d to %d PEBs", vol_id, vol->reserved_pebs, reserved_pebs); - ubi_assert(desc->mode == UBI_EXCLUSIVE); - ubi_assert(vol == ubi->volumes[vol_id]); if (vol->vol_type == UBI_STATIC_VOLUME && reserved_pebs < vol->used_ebs) { @@ -487,6 +518,14 @@ for (i = 0; i < reserved_pebs; i++) new_mapping[i] = UBI_LEB_UNMAPPED; + spin_lock(&ubi->volumes_lock); + if (vol->ref_count > 1) { + spin_unlock(&ubi->volumes_lock); + err = -EBUSY; + goto out_free; + } + spin_unlock(&ubi->volumes_lock); + /* Reserve physical eraseblocks */ pebs = reserved_pebs - vol->reserved_pebs; if (pebs > 0) { @@ -516,7 +555,7 @@ if (pebs < 0) { for (i = 0; i < -pebs; i++) { - err = ubi_eba_unmap_leb(ubi, vol_id, reserved_pebs + i); + err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i); if (err) goto out_acc; } @@ -547,8 +586,8 @@ (long long)vol->used_ebs * vol->usable_leb_size; } - paranoid_check_volumes(ubi); - return 0; + err = paranoid_check_volumes(ubi); + return err; out_acc: if (pebs > 0) { @@ -563,29 +602,67 @@ } /** + * ubi_rename_volumes - re-name UBI volumes. + * @ubi: UBI device description object + * @rename_list: list of &struct ubi_rename_entry objects + * + * This function re-names or removes volumes specified in the re-name list. + * Returns zero in case of success and a negative error code in case of + * failure. 
+ */ +int ubi_rename_volumes(struct ubi_device *ubi, struct list_head *rename_list) +{ + int err; + struct ubi_rename_entry *re; + + err = ubi_vtbl_rename_volumes(ubi, rename_list); + if (err) + return err; + + list_for_each_entry(re, rename_list, list) { + if (re->remove) { + err = ubi_remove_volume(re->desc, 1); + if (err) + break; + } else { + struct ubi_volume *vol = re->desc->vol; + + spin_lock(&ubi->volumes_lock); + vol->name_len = re->new_name_len; + memcpy(vol->name, re->new_name, re->new_name_len + 1); + spin_unlock(&ubi->volumes_lock); + } + } + + if (!err) + err = paranoid_check_volumes(ubi); + return err; +} + +/** * ubi_add_volume - add volume. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * - * This function adds an existin volume and initializes all its data - * structures. Returnes zero in case of success and a negative error code in + * This function adds an existing volume and initializes all its data + * structures. Returns zero in case of success and a negative error code in * case of failure. 
*/ -int ubi_add_volume(struct ubi_device *ubi, int vol_id) +int ubi_add_volume(struct ubi_device *ubi, struct ubi_volume *vol) { - int err; - struct ubi_volume *vol = ubi->volumes[vol_id]; + int err, vol_id = vol->vol_id; + dev_t dev; - dbg_msg("add volume %d", vol_id); - ubi_dbg_dump_vol_info(vol); - ubi_assert(vol); + dbg_gen("add volume %d", vol_id); /* Register character device for the volume */ cdev_init(&vol->cdev, &ubi_vol_cdev_operations); vol->cdev.owner = THIS_MODULE; - err = cdev_add(&vol->cdev, MKDEV(ubi->major, vol->vol_id + 1), 1); + dev = MKDEV(MAJOR(ubi->cdev.dev), vol->vol_id + 1); + err = cdev_add(&vol->cdev, dev, 1); if (err) { - ubi_err("cannot add character device for volume %d", vol_id); + ubi_err("cannot add character device for volume %d, error %d", + vol_id, err); return err; } @@ -595,7 +672,7 @@ vol->dev.release = vol_release; vol->dev.parent = &ubi->dev; - vol->dev.devt = MKDEV(ubi->major, vol->vol_id + 1); + vol->dev.devt = dev; vol->dev.class = ubi_class; sprintf(&vol->dev.bus_id[0], "%s_%d", ubi->ubi_name, vol->vol_id); err = device_register(&vol->dev); @@ -610,8 +687,8 @@ return err; } - paranoid_check_volumes(ubi); - return 0; + err = paranoid_check_volumes(ubi); + return err; out_gluebi: err = ubi_destroy_gluebi(vol); @@ -623,22 +700,19 @@ /** * ubi_free_volume - free volume. * @ubi: UBI device description object - * @vol_id: volume ID + * @vol: volume description object * - * This function frees all resources for volume @vol_id but does not remove it. + * This function frees all resources for volume @vol but does not remove it. * Used only when the UBI device is detached. 
*/ -void ubi_free_volume(struct ubi_device *ubi, int vol_id) +void ubi_free_volume(struct ubi_device *ubi, struct ubi_volume *vol) { int err; - struct ubi_volume *vol = ubi->volumes[vol_id]; - dbg_msg("free volume %d", vol_id); - ubi_assert(vol); + dbg_gen("free volume %d", vol->vol_id); - vol->removed = 1; + ubi->volumes[vol->vol_id] = NULL; err = ubi_destroy_gluebi(vol); - ubi->volumes[vol_id] = NULL; cdev_del(&vol->cdev); volume_sysfs_close(vol); } @@ -649,8 +723,10 @@ * paranoid_check_volume - check volume information. * @ubi: UBI device description object * @vol_id: volume ID + * + * Returns zero if volume is all right and a negative error code if not. */ -static void paranoid_check_volume(struct ubi_device *ubi, int vol_id) +static int paranoid_check_volume(struct ubi_device *ubi, int vol_id) { int idx = vol_id2idx(ubi, vol_id); int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker; @@ -668,16 +744,7 @@ goto fail; } spin_unlock(&ubi->volumes_lock); - return; - } - - if (vol->exclusive) { - /* - * The volume may be being created at the moment, do not check - * it (e.g., it may be in the middle of ubi_create_volume().
- */ - spin_unlock(&ubi->volumes_lock); - return; + return 0; } if (vol->reserved_pebs < 0 || vol->alignment < 0 || vol->data_pad < 0 || @@ -690,7 +757,7 @@ goto fail; } - n = vol->alignment % ubi->min_io_size; + n = vol->alignment & (ubi->min_io_size - 1); if (vol->alignment != 1 && n) { ubi_err("alignment is not multiple of min I/O unit"); goto fail; @@ -708,11 +775,6 @@ goto fail; } - if (vol->upd_marker != 0 && vol->upd_marker != 1) { - ubi_err("bad upd_marker"); - goto fail; - } - if (vol->upd_marker && vol->corrupted) { dbg_err("update marker and corrupted simultaneously"); goto fail; @@ -747,7 +809,7 @@ n = (long long)vol->used_ebs * vol->usable_leb_size; if (vol->vol_type == UBI_DYNAMIC_VOLUME) { - if (vol->corrupted != 0) { + if (vol->corrupted) { ubi_err("corrupted dynamic volume"); goto fail; } @@ -764,10 +826,6 @@ goto fail; } } else { - if (vol->corrupted != 0 && vol->corrupted != 1) { - ubi_err("bad corrupted"); - goto fail; - } if (vol->used_ebs < 0 || vol->used_ebs > vol->reserved_pebs) { ubi_err("bad used_ebs"); goto fail; @@ -796,33 +854,39 @@ if (alignment != vol->alignment || data_pad != vol->data_pad || upd_marker != vol->upd_marker || vol_type != vol->vol_type || - name_len!= vol->name_len || strncmp(name, vol->name, name_len)) { + name_len != vol->name_len || strncmp(name, vol->name, name_len)) { ubi_err("volume info is different"); goto fail; } spin_unlock(&ubi->volumes_lock); - return; + return 0; fail: ubi_err("paranoid check failed for volume %d", vol_id); - ubi_dbg_dump_vol_info(vol); + if (vol) + ubi_dbg_dump_vol_info(vol); ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id); spin_unlock(&ubi->volumes_lock); - BUG(); + return -EINVAL; } /** * paranoid_check_volumes - check information about all volumes. * @ubi: UBI device description object + * + * Returns zero if volumes are all right and a a negative error code if not. 
*/ -static void paranoid_check_volumes(struct ubi_device *ubi) +static int paranoid_check_volumes(struct ubi_device *ubi) { - int i; + int i, err = 0; - mutex_lock(&ubi->vtbl_mutex); - for (i = 0; i < ubi->vtbl_slots; i++) - paranoid_check_volume(ubi, i); - mutex_unlock(&ubi->vtbl_mutex); + for (i = 0; i < ubi->vtbl_slots; i++) { + err = paranoid_check_volume(ubi, i); + if (err) + break; + } + + return err; } #endif diff -Nurd linux-2.6.24/drivers/mtd/ubi/vtbl.c ubifs-v2.6.24/drivers/mtd/ubi/vtbl.c --- linux-2.6.24/drivers/mtd/ubi/vtbl.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/vtbl.c 2009-04-07 17:14:47.000000000 +0200 @@ -86,8 +86,10 @@ { int i, err; uint32_t crc; + struct ubi_volume *layout_vol; ubi_assert(idx >= 0 && idx < ubi->vtbl_slots); + layout_vol = ubi->volumes[vol_id2idx(ubi, UBI_LAYOUT_VOLUME_ID)]; if (!vtbl_rec) vtbl_rec = &empty_vtbl_record; @@ -96,31 +98,75 @@ vtbl_rec->crc = cpu_to_be32(crc); } - mutex_lock(&ubi->vtbl_mutex); memcpy(&ubi->vtbl[idx], vtbl_rec, sizeof(struct ubi_vtbl_record)); for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { - err = ubi_eba_unmap_leb(ubi, UBI_LAYOUT_VOL_ID, i); - if (err) { - mutex_unlock(&ubi->vtbl_mutex); + err = ubi_eba_unmap_leb(ubi, layout_vol, i); + if (err) return err; - } - err = ubi_eba_write_leb(ubi, UBI_LAYOUT_VOL_ID, i, ubi->vtbl, 0, + + err = ubi_eba_write_leb(ubi, layout_vol, i, ubi->vtbl, 0, ubi->vtbl_size, UBI_LONGTERM); - if (err) { - mutex_unlock(&ubi->vtbl_mutex); + if (err) return err; - } } paranoid_vtbl_check(ubi); - mutex_unlock(&ubi->vtbl_mutex); - return ubi_wl_flush(ubi); + return 0; } /** - * vol_til_check - check if volume table is not corrupted and contains sensible - * data. + * ubi_vtbl_rename_volumes - rename UBI volumes in the volume table. + * @ubi: UBI device description object + * @rename_list: list of &struct ubi_rename_entry objects * + * This function re-names multiple volumes specified in @req in the volume + * table. 
Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubi_vtbl_rename_volumes(struct ubi_device *ubi, + struct list_head *rename_list) +{ + int i, err; + struct ubi_rename_entry *re; + struct ubi_volume *layout_vol; + + list_for_each_entry(re, rename_list, list) { + uint32_t crc; + struct ubi_volume *vol = re->desc->vol; + struct ubi_vtbl_record *vtbl_rec = &ubi->vtbl[vol->vol_id]; + + if (re->remove) { + memcpy(vtbl_rec, &empty_vtbl_record, + sizeof(struct ubi_vtbl_record)); + continue; + } + + vtbl_rec->name_len = cpu_to_be16(re->new_name_len); + memcpy(vtbl_rec->name, re->new_name, re->new_name_len); + memset(vtbl_rec->name + re->new_name_len, 0, + UBI_VOL_NAME_MAX + 1 - re->new_name_len); + crc = crc32(UBI_CRC32_INIT, vtbl_rec, + UBI_VTBL_RECORD_SIZE_CRC); + vtbl_rec->crc = cpu_to_be32(crc); + } + + layout_vol = ubi->volumes[vol_id2idx(ubi, UBI_LAYOUT_VOLUME_ID)]; + for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) { + err = ubi_eba_unmap_leb(ubi, layout_vol, i); + if (err) + return err; + + err = ubi_eba_write_leb(ubi, layout_vol, i, ubi->vtbl, 0, + ubi->vtbl_size, UBI_LONGTERM); + if (err) + return err; + } + + return 0; +} + +/** + * vtbl_check - check if volume table is not corrupted and sensible. 
* @ubi: UBI device description object * @vtbl: volume table * @@ -131,7 +177,7 @@ const struct ubi_vtbl_record *vtbl) { int i, n, reserved_pebs, alignment, data_pad, vol_type, name_len; - int upd_marker; + int upd_marker, err; uint32_t crc; const char *name; @@ -157,7 +203,7 @@ if (reserved_pebs == 0) { if (memcmp(&vtbl[i], &empty_vtbl_record, UBI_VTBL_RECORD_SIZE)) { - dbg_err("bad empty record"); + err = 2; goto bad; } continue; @@ -165,56 +211,57 @@ if (reserved_pebs < 0 || alignment < 0 || data_pad < 0 || name_len < 0) { - dbg_err("negative values"); + err = 3; goto bad; } if (alignment > ubi->leb_size || alignment == 0) { - dbg_err("bad alignment"); + err = 4; goto bad; } - n = alignment % ubi->min_io_size; + n = alignment & (ubi->min_io_size - 1); if (alignment != 1 && n) { - dbg_err("alignment is not multiple of min I/O unit"); + err = 5; goto bad; } n = ubi->leb_size % alignment; if (data_pad != n) { dbg_err("bad data_pad, has to be %d", n); + err = 6; goto bad; } if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) { - dbg_err("bad vol_type"); + err = 7; goto bad; } if (upd_marker != 0 && upd_marker != 1) { - dbg_err("bad upd_marker"); + err = 8; goto bad; } if (reserved_pebs > ubi->good_peb_count) { - dbg_err("too large reserved_pebs, good PEBs %d", - ubi->good_peb_count); + dbg_err("too large reserved_pebs %d, good PEBs %d", + reserved_pebs, ubi->good_peb_count); + err = 9; goto bad; } if (name_len > UBI_VOL_NAME_MAX) { - dbg_err("too long volume name, max %d", - UBI_VOL_NAME_MAX); + err = 10; goto bad; } if (name[0] == '\0') { - dbg_err("NULL volume name"); + err = 11; goto bad; } if (name_len != strnlen(name, name_len + 1)) { - dbg_err("bad name_len"); + err = 12; goto bad; } } @@ -239,7 +286,7 @@ return 0; bad: - ubi_err("volume table check failed, record %d", i); + ubi_err("volume table check failed: record %d, error %d", i, err); ubi_dbg_dump_vtbl_record(&vtbl[i], i); return -EINVAL; } @@ -273,7 +320,7 @@ * this volume table copy was found 
during scanning. It has to be wiped * out. */ - sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID); + sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOLUME_ID); if (sv) old_seb = ubi_scan_find_seb(sv, copy); @@ -285,13 +332,12 @@ } vid_hdr->vol_type = UBI_VID_DYNAMIC; - vid_hdr->vol_id = cpu_to_be32(UBI_LAYOUT_VOL_ID); + vid_hdr->vol_id = cpu_to_be32(UBI_LAYOUT_VOLUME_ID); vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT; vid_hdr->data_size = vid_hdr->used_ebs = vid_hdr->data_pad = cpu_to_be32(0); vid_hdr->lnum = cpu_to_be32(copy); vid_hdr->sqnum = cpu_to_be64(++si->max_sqnum); - vid_hdr->leb_ver = cpu_to_be32(old_seb ? old_seb->leb_ver + 1: 0); /* The EC header is already there, write the VID header */ err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr); @@ -374,7 +420,7 @@ * to LEB 0. */ - dbg_msg("check layout volume"); + dbg_gen("check layout volume"); /* Read both LEB 0 and LEB 1 into memory */ ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) { @@ -388,7 +434,16 @@ err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0, ubi->vtbl_size); if (err == UBI_IO_BITFLIPS || err == -EBADMSG) - /* Scrub the PEB later */ + /* + * Scrub the PEB later. Note, -EBADMSG indicates an + * uncorrectable ECC error, but we have our own CRC and + * the data will be checked later. If the data is OK, + * the PEB will be scrubbed (because we set + * seb->scrub). If the data is not OK, the contents of + * the PEB will be recovered from the second copy, and + * seb->scrub will be cleared in + * 'ubi_scan_add_used()'. 
+ */ seb->scrub = 1; else if (err) goto out_free; @@ -404,7 +459,8 @@ if (!leb_corrupted[0]) { /* LEB 0 is OK */ if (leb[1]) - leb_corrupted[1] = memcmp(leb[0], leb[1], ubi->vtbl_size); + leb_corrupted[1] = memcmp(leb[0], leb[1], + ubi->vtbl_size); if (leb_corrupted[1]) { ubi_warn("volume table copy #2 is corrupted"); err = create_vtbl(ubi, si, 1, leb[0]); @@ -518,6 +574,17 @@ vol->name[vol->name_len] = '\0'; vol->vol_id = i; + if (vtbl[i].flags & UBI_VTBL_AUTORESIZE_FLG) { + /* Auto re-size flag may be set only for one volume */ + if (ubi->autoresize_vol_id != -1) { + ubi_err("more then one auto-resize volume (%d " + "and %d)", ubi->autoresize_vol_id, i); + return -EINVAL; + } + + ubi->autoresize_vol_id = i; + } + ubi_assert(!ubi->volumes[i]); ubi->volumes[i] = vol; ubi->vol_count += 1; @@ -568,6 +635,7 @@ vol->last_eb_bytes = sv->last_data_size; } + /* And add the layout volume */ vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL); if (!vol) return -ENOMEM; @@ -582,7 +650,8 @@ vol->last_eb_bytes = vol->reserved_pebs; vol->used_bytes = (long long)vol->used_ebs * (ubi->leb_size - vol->data_pad); - vol->vol_id = UBI_LAYOUT_VOL_ID; + vol->vol_id = UBI_LAYOUT_VOLUME_ID; + vol->ref_count = 1; ubi_assert(!ubi->volumes[i]); ubi->volumes[vol_id2idx(ubi, vol->vol_id)] = vol; @@ -610,30 +679,32 @@ static int check_sv(const struct ubi_volume *vol, const struct ubi_scan_volume *sv) { + int err; + if (sv->highest_lnum >= vol->reserved_pebs) { - dbg_err("bad highest_lnum"); + err = 1; goto bad; } if (sv->leb_count > vol->reserved_pebs) { - dbg_err("bad leb_count"); + err = 2; goto bad; } if (sv->vol_type != vol->vol_type) { - dbg_err("bad vol_type"); + err = 3; goto bad; } if (sv->used_ebs > vol->reserved_pebs) { - dbg_err("bad used_ebs"); + err = 4; goto bad; } if (sv->data_pad != vol->data_pad) { - dbg_err("bad data_pad"); + err = 5; goto bad; } return 0; bad: - ubi_err("bad scanning information"); + ubi_err("bad scanning information, error %d", err); ubi_dbg_dump_sv(sv); 
ubi_dbg_dump_vol_info(vol); return -EINVAL; @@ -662,14 +733,13 @@ return -EINVAL; } - if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT&& + if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT && si->highest_vol_id < UBI_INTERNAL_VOL_START) { ubi_err("too large volume ID %d found by scanning", si->highest_vol_id); return -EINVAL; } - for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { cond_resched(); @@ -707,8 +777,7 @@ } /** - * ubi_read_volume_table - read volume table. - * information. + * ubi_read_volume_table - read the volume table. * @ubi: UBI device description object * @si: scanning information * @@ -734,7 +803,7 @@ ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE; ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size); - sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOL_ID); + sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOLUME_ID); if (!sv) { /* * No logical eraseblocks belonging to the layout volume were @@ -787,11 +856,10 @@ out_free: vfree(ubi->vtbl); - for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) - if (ubi->volumes[i]) { - kfree(ubi->volumes[i]); - ubi->volumes[i] = NULL; - } + for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) { + kfree(ubi->volumes[i]); + ubi->volumes[i] = NULL; + } return err; } diff -Nurd linux-2.6.24/drivers/mtd/ubi/wl.c ubifs-v2.6.24/drivers/mtd/ubi/wl.c --- linux-2.6.24/drivers/mtd/ubi/wl.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/drivers/mtd/ubi/wl.c 2009-04-07 17:14:47.000000000 +0200 @@ -19,22 +19,22 @@ */ /* - * UBI wear-leveling unit. + * UBI wear-leveling sub-system. * - * This unit is responsible for wear-leveling. It works in terms of physical - * eraseblocks and erase counters and knows nothing about logical eraseblocks, - * volumes, etc. From this unit's perspective all physical eraseblocks are of - * two types - used and free. 
Used physical eraseblocks are those that were - * "get" by the 'ubi_wl_get_peb()' function, and free physical eraseblocks are - * those that were put by the 'ubi_wl_put_peb()' function. + * This sub-system is responsible for wear-leveling. It works in terms of + * physical eraseblocks and erase counters and knows nothing about logical + * eraseblocks, volumes, etc. From this sub-system's perspective all physical + * eraseblocks are of two types - used and free. Used physical eraseblocks are + * those that were "get" by the 'ubi_wl_get_peb()' function, and free physical + * eraseblocks are those that were put by the 'ubi_wl_put_peb()' function. * * Physical eraseblocks returned by 'ubi_wl_get_peb()' have only erase counter - * header. The rest of the physical eraseblock contains only 0xFF bytes. + * header. The rest of the physical eraseblock contains only %0xFF bytes. * - * When physical eraseblocks are returned to the WL unit by means of the + * When physical eraseblocks are returned to the WL sub-system by means of the * 'ubi_wl_put_peb()' function, they are scheduled for erasure. The erasure is * done asynchronously in context of the per-UBI device background thread, - * which is also managed by the WL unit. + * which is also managed by the WL sub-system. * * The wear-leveling is ensured by means of moving the contents of used * physical eraseblocks with low erase counter to free physical eraseblocks @@ -43,34 +43,64 @@ * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick * an "optimal" physical eraseblock. For example, when it is known that the * physical eraseblock will be "put" soon because it contains short-term data, - * the WL unit may pick a free physical eraseblock with low erase counter, and - * so forth. + * the WL sub-system may pick a free physical eraseblock with low erase + * counter, and so forth. * - * If the WL unit fails to erase a physical eraseblock, it marks it as bad. 
+ * If the WL sub-system fails to erase a physical eraseblock, it marks it as + * bad. * - * This unit is also responsible for scrubbing. If a bit-flip is detected in a - * physical eraseblock, it has to be moved. Technically this is the same as - * moving it for wear-leveling reasons. + * This sub-system is also responsible for scrubbing. If a bit-flip is detected + * in a physical eraseblock, it has to be moved. Technically this is the same + * as moving it for wear-leveling reasons. * - * As it was said, for the UBI unit all physical eraseblocks are either "free" - * or "used". Free eraseblock are kept in the @wl->free RB-tree, while used - * eraseblocks are kept in a set of different RB-trees: @wl->used, - * @wl->prot.pnum, @wl->prot.aec, and @wl->scrub. + * As it was said, for the UBI sub-system all physical eraseblocks are either + * "free" or "used". Free eraseblock are kept in the @wl->free RB-tree, while + * used eraseblocks are kept in @wl->used or @wl->scrub RB-trees, or + * (temporarily) in the @wl->pq queue. + * + * When the WL sub-system returns a physical eraseblock, the physical + * eraseblock is protected from being moved for some "time". For this reason, + * the physical eraseblock is not directly moved from the @wl->free tree to the + * @wl->used tree. There is a protection queue in between where this + * physical eraseblock is temporarily stored (@wl->pq). + * + * All this protection stuff is needed because: + * o we don't want to move physical eraseblocks just after we have given them + * to the user; instead, we first want to let users fill them up with data; + * + * o there is a chance that the user will put the physical eraseblock very + * soon, so it makes sense not to move it for some time, but wait; this is + * especially important in case of "short term" physical eraseblocks. + * + * Physical eraseblocks stay protected only for limited time. But the "time" is + * measured in erase cycles in this case. 
This is implemented with help of the + * protection queue. Eraseblocks are put to the tail of this queue when they + * are returned by the 'ubi_wl_get_peb()', and eraseblocks are removed from the + * head of the queue on each erase operation (for any eraseblock). So the + * length of the queue defines how many (global) erase cycles PEBs are protected. + * + * To put it differently, each physical eraseblock has 2 main states: free and + * used. The former state corresponds to the @wl->free tree. The latter state + * is split up on several sub-states: + * o the WL movement is allowed (@wl->used tree); + * o the WL movement is temporarily prohibited (@wl->pq queue); + * o scrubbing is needed (@wl->scrub tree). + * + * Depending on the sub-state, wear-leveling entries of the used physical + * eraseblocks may be kept in one of those structures. * * Note, in this implementation, we keep a small in-RAM object for each physical * eraseblock. This is surely not a scalable solution. But it appears to be good * enough for moderately large flashes and it is simple. In future, one may - * re-work this unit and make it more scalable. + * re-work this sub-system and make it more scalable. * - * At the moment this unit does not utilize the sequence number, which was - * introduced relatively recently. But it would be wise to do this because the - * sequence number of a logical eraseblock characterizes how old is it. For + * At the moment this sub-system does not utilize the sequence number, which + * was introduced relatively recently. But it would be wise to do this because + * the sequence number of a logical eraseblock characterizes how old it is. For * example, when we move a PEB with low erase counter, and we need to pick the * target PEB, we pick a PEB with the highest EC if our PEB is "old" and we * pick target PEB with an average EC if our PEB is not very "old". This is a - * room for future re-works of the WL unit.
- * - * FIXME: looks too complex, should be simplified (later). + * room for future re-works of the WL sub-system. */ #include @@ -83,29 +113,22 @@ #define WL_RESERVED_PEBS 1 /* - * How many erase cycles are short term, unknown, and long term physical - * eraseblocks protected. - */ -#define ST_PROTECTION 16 -#define U_PROTECTION 10 -#define LT_PROTECTION 4 - -/* * Maximum difference between two erase counters. If this threshold is - * exceeded, the WL unit starts moving data from used physical eraseblocks with - * low erase counter to free physical eraseblocks with high erase counter. + * exceeded, the WL sub-system starts moving data from used physical + * eraseblocks with low erase counter to free physical eraseblocks with high + * erase counter. */ #define UBI_WL_THRESHOLD CONFIG_MTD_UBI_WL_THRESHOLD /* - * When a physical eraseblock is moved, the WL unit has to pick the target + * When a physical eraseblock is moved, the WL sub-system has to pick the target * physical eraseblock to move to. The simplest way would be just to pick the * one with the highest erase counter. But in certain workloads this could lead * to an unlimited wear of one or few physical eraseblock. Indeed, imagine a * situation when the picked physical eraseblock is constantly erased after the * data is written to it. So, we have a constant which limits the highest erase - * counter of the free physical eraseblock to pick. Namely, the WL unit does - * not pick eraseblocks with erase counter greater then the lowest erase + * counter of the free physical eraseblock to pick. Namely, the WL sub-system + * does not pick eraseblocks with erase counter greater then the lowest erase * counter plus %WL_FREE_MAX_DIFF. */ #define WL_FREE_MAX_DIFF (2*UBI_WL_THRESHOLD) @@ -117,80 +140,9 @@ #define WL_MAX_FAILURES 32 /** - * struct ubi_wl_entry - wear-leveling entry. 
- * @rb: link in the corresponding RB-tree - * @ec: erase counter - * @pnum: physical eraseblock number - * - * Each physical eraseblock has a corresponding &struct wl_entry object which - * may be kept in different RB-trees. - */ -struct ubi_wl_entry { - struct rb_node rb; - int ec; - int pnum; -}; - -/** - * struct ubi_wl_prot_entry - PEB protection entry. - * @rb_pnum: link in the @wl->prot.pnum RB-tree - * @rb_aec: link in the @wl->prot.aec RB-tree - * @abs_ec: the absolute erase counter value when the protection ends - * @e: the wear-leveling entry of the physical eraseblock under protection - * - * When the WL unit returns a physical eraseblock, the physical eraseblock is - * protected from being moved for some "time". For this reason, the physical - * eraseblock is not directly moved from the @wl->free tree to the @wl->used - * tree. There is one more tree in between where this physical eraseblock is - * temporarily stored (@wl->prot). - * - * All this protection stuff is needed because: - * o we don't want to move physical eraseblocks just after we have given them - * to the user; instead, we first want to let users fill them up with data; - * - * o there is a chance that the user will put the physical eraseblock very - * soon, so it makes sense not to move it for some time, but wait; this is - * especially important in case of "short term" physical eraseblocks. - * - * Physical eraseblocks stay protected only for limited time. But the "time" is - * measured in erase cycles in this case. This is implemented with help of the - * absolute erase counter (@wl->abs_ec). When it reaches certain value, the - * physical eraseblocks are moved from the protection trees (@wl->prot.*) to - * the @wl->used tree. - * - * Protected physical eraseblocks are searched by physical eraseblock number - * (when they are put) and by the absolute erase counter (to check if it is - * time to move them to the @wl->used tree). 
So there are actually 2 RB-trees - * storing the protected physical eraseblocks: @wl->prot.pnum and - * @wl->prot.aec. They are referred to as the "protection" trees. The - * first one is indexed by the physical eraseblock number. The second one is - * indexed by the absolute erase counter. Both trees store - * &struct ubi_wl_prot_entry objects. - * - * Each physical eraseblock has 2 main states: free and used. The former state - * corresponds to the @wl->free tree. The latter state is split up on several - * sub-states: - * o the WL movement is allowed (@wl->used tree); - * o the WL movement is temporarily prohibited (@wl->prot.pnum and - * @wl->prot.aec trees); - * o scrubbing is needed (@wl->scrub tree). - * - * Depending on the sub-state, wear-leveling entries of the used physical - * eraseblocks may be kept in one of those trees. - */ -struct ubi_wl_prot_entry { - struct rb_node rb_pnum; - struct rb_node rb_aec; - unsigned long long abs_ec; - struct ubi_wl_entry *e; -}; - -/** * struct ubi_work - UBI work description data structure. * @list: a link in the list of pending works * @func: worker function - * @priv: private data of the worker function - * * @e: physical eraseblock to erase * @torture: if the physical eraseblock has to be tortured * @@ -211,14 +163,13 @@ static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec); static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root); +static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e); #else #define paranoid_check_ec(ubi, pnum, ec) 0 #define paranoid_check_in_wl_tree(e, root) +#define paranoid_check_in_pq(ubi, e) 0 #endif -/* Slab cache for wear-leveling entries */ -static struct kmem_cache *wl_entries_slab; - /** * wl_tree_add - add a wear-leveling entry to a WL RB-tree. 
* @e: the wear-leveling entry to add @@ -236,7 +187,7 @@ struct ubi_wl_entry *e1; parent = *p; - e1 = rb_entry(parent, struct ubi_wl_entry, rb); + e1 = rb_entry(parent, struct ubi_wl_entry, u.rb); if (e->ec < e1->ec) p = &(*p)->rb_left; @@ -251,8 +202,8 @@ } } - rb_link_node(&e->rb, parent, p); - rb_insert_color(&e->rb, root); + rb_link_node(&e->u.rb, parent, p); + rb_insert_color(&e->u.rb, root); } /** @@ -267,15 +218,26 @@ int err; struct ubi_work *wrk; - spin_lock(&ubi->wl_lock); + cond_resched(); + /* + * @ubi->work_sem is used to synchronize with the workers. Workers take + * it in read mode, so many of them may be doing works at a time. But + * the queue flush code has to be sure the whole queue of works is + * done, and it takes the mutex in write mode. + */ + down_read(&ubi->work_sem); + spin_lock(&ubi->wl_lock); if (list_empty(&ubi->works)) { spin_unlock(&ubi->wl_lock); + up_read(&ubi->work_sem); return 0; } wrk = list_entry(ubi->works.next, struct ubi_work, list); list_del(&wrk->list); + ubi->works_count -= 1; + ubi_assert(ubi->works_count >= 0); spin_unlock(&ubi->wl_lock); /* @@ -286,11 +248,8 @@ err = wrk->func(ubi, wrk, 0); if (err) ubi_err("work failed with error code %d", err); + up_read(&ubi->work_sem); - spin_lock(&ubi->wl_lock); - ubi->works_count -= 1; - ubi_assert(ubi->works_count >= 0); - spin_unlock(&ubi->wl_lock); return err; } @@ -339,7 +298,7 @@ while (p) { struct ubi_wl_entry *e1; - e1 = rb_entry(p, struct ubi_wl_entry, rb); + e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e->pnum == e1->pnum) { ubi_assert(e == e1); @@ -363,50 +322,24 @@ } /** - * prot_tree_add - add physical eraseblock to protection trees. + * prot_queue_add - add physical eraseblock to the protection queue. * @ubi: UBI device description object * @e: the physical eraseblock to add - * @pe: protection entry object to use - * @abs_ec: absolute erase counter value when this physical eraseblock has - * to be removed from the protection trees. 
* - * @wl->lock has to be locked. + * This function adds @e to the tail of the protection queue @ubi->pq, where + * @e will stay for %UBI_PROT_QUEUE_LEN erase operations and will be + * temporarily protected from the wear-leveling worker. Note, @wl->lock has to + * be locked. */ -static void prot_tree_add(struct ubi_device *ubi, struct ubi_wl_entry *e, - struct ubi_wl_prot_entry *pe, int abs_ec) +static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e) { - struct rb_node **p, *parent = NULL; - struct ubi_wl_prot_entry *pe1; - - pe->e = e; - pe->abs_ec = ubi->abs_ec + abs_ec; - - p = &ubi->prot.pnum.rb_node; - while (*p) { - parent = *p; - pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_pnum); - - if (e->pnum < pe1->e->pnum) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&pe->rb_pnum, parent, p); - rb_insert_color(&pe->rb_pnum, &ubi->prot.pnum); - - p = &ubi->prot.aec.rb_node; - parent = NULL; - while (*p) { - parent = *p; - pe1 = rb_entry(parent, struct ubi_wl_prot_entry, rb_aec); + int pq_tail = ubi->pq_head - 1; - if (pe->abs_ec < pe1->abs_ec) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - rb_link_node(&pe->rb_aec, parent, p); - rb_insert_color(&pe->rb_aec, &ubi->prot.aec); + if (pq_tail < 0) + pq_tail = UBI_PROT_QUEUE_LEN - 1; + ubi_assert(pq_tail >= 0 && pq_tail < UBI_PROT_QUEUE_LEN); + list_add_tail(&e->u.list, &ubi->pq[pq_tail]); + dbg_wl("added PEB %d EC %d to the protection queue", e->pnum, e->ec); } /** @@ -422,14 +355,14 @@ struct rb_node *p; struct ubi_wl_entry *e; - e = rb_entry(rb_first(root), struct ubi_wl_entry, rb); + e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb); max += e->ec; p = root->rb_node; while (p) { struct ubi_wl_entry *e1; - e1 = rb_entry(p, struct ubi_wl_entry, rb); + e1 = rb_entry(p, struct ubi_wl_entry, u.rb); if (e1->ec >= max) p = p->rb_left; else { @@ -451,17 +384,12 @@ */ int ubi_wl_get_peb(struct ubi_device *ubi, int dtype) { - int err, protect, medium_ec; + int 
err, medium_ec; struct ubi_wl_entry *e, *first, *last; - struct ubi_wl_prot_entry *pe; ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM || dtype == UBI_UNKNOWN); - pe = kmalloc(sizeof(struct ubi_wl_prot_entry), GFP_NOFS); - if (!pe) - return -ENOMEM; - retry: spin_lock(&ubi->wl_lock); if (!ubi->free.rb_node) { @@ -469,110 +397,91 @@ ubi_assert(list_empty(&ubi->works)); ubi_err("no free eraseblocks"); spin_unlock(&ubi->wl_lock); - kfree(pe); return -ENOSPC; } spin_unlock(&ubi->wl_lock); err = produce_free_peb(ubi); - if (err < 0) { - kfree(pe); + if (err < 0) return err; - } goto retry; } switch (dtype) { - case UBI_LONGTERM: - /* - * For long term data we pick a physical eraseblock - * with high erase counter. But the highest erase - * counter we can pick is bounded by the the lowest - * erase counter plus %WL_FREE_MAX_DIFF. - */ - e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); - protect = LT_PROTECTION; - break; - case UBI_UNKNOWN: - /* - * For unknown data we pick a physical eraseblock with - * medium erase counter. But we by no means can pick a - * physical eraseblock with erase counter greater or - * equivalent than the lowest erase counter plus - * %WL_FREE_MAX_DIFF. - */ - first = rb_entry(rb_first(&ubi->free), - struct ubi_wl_entry, rb); - last = rb_entry(rb_last(&ubi->free), - struct ubi_wl_entry, rb); + case UBI_LONGTERM: + /* + * For long term data we pick a physical eraseblock with high + * erase counter. But the highest erase counter we can pick is + * bounded by the the lowest erase counter plus + * %WL_FREE_MAX_DIFF. + */ + e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); + break; + case UBI_UNKNOWN: + /* + * For unknown data we pick a physical eraseblock with medium + * erase counter. But we by no means can pick a physical + * eraseblock with erase counter greater or equivalent than the + * lowest erase counter plus %WL_FREE_MAX_DIFF. 
+ */ + first = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, + u.rb); + last = rb_entry(rb_last(&ubi->free), struct ubi_wl_entry, u.rb); - if (last->ec - first->ec < WL_FREE_MAX_DIFF) - e = rb_entry(ubi->free.rb_node, - struct ubi_wl_entry, rb); - else { - medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2; - e = find_wl_entry(&ubi->free, medium_ec); - } - protect = U_PROTECTION; - break; - case UBI_SHORTTERM: - /* - * For short term data we pick a physical eraseblock - * with the lowest erase counter as we expect it will - * be erased soon. - */ - e = rb_entry(rb_first(&ubi->free), - struct ubi_wl_entry, rb); - protect = ST_PROTECTION; - break; - default: - protect = 0; - e = NULL; - BUG(); + if (last->ec - first->ec < WL_FREE_MAX_DIFF) + e = rb_entry(ubi->free.rb_node, + struct ubi_wl_entry, u.rb); + else { + medium_ec = (first->ec + WL_FREE_MAX_DIFF)/2; + e = find_wl_entry(&ubi->free, medium_ec); + } + break; + case UBI_SHORTTERM: + /* + * For short term data we pick a physical eraseblock with the + * lowest erase counter as we expect it will be erased soon. + */ + e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, u.rb); + break; + default: + BUG(); } + paranoid_check_in_wl_tree(e, &ubi->free); + /* - * Move the physical eraseblock to the protection trees where it will + * Move the physical eraseblock to the protection queue where it will * be protected from being moved for some time. */ - paranoid_check_in_wl_tree(e, &ubi->free); - rb_erase(&e->rb, &ubi->free); - prot_tree_add(ubi, e, pe, protect); - - dbg_wl("PEB %d EC %d, protection %d", e->pnum, e->ec, protect); + rb_erase(&e->u.rb, &ubi->free); + dbg_wl("PEB %d EC %d", e->pnum, e->ec); + prot_queue_add(ubi, e); spin_unlock(&ubi->wl_lock); - return e->pnum; } /** - * prot_tree_del - remove a physical eraseblock from the protection trees + * prot_queue_del - remove a physical eraseblock from the protection queue. 
* @ubi: UBI device description object * @pnum: the physical eraseblock to remove + * + * This function deletes PEB @pnum from the protection queue and returns zero + * in case of success and %-ENODEV if the PEB was not found. */ -static void prot_tree_del(struct ubi_device *ubi, int pnum) +static int prot_queue_del(struct ubi_device *ubi, int pnum) { - struct rb_node *p; - struct ubi_wl_prot_entry *pe = NULL; - - p = ubi->prot.pnum.rb_node; - while (p) { - - pe = rb_entry(p, struct ubi_wl_prot_entry, rb_pnum); + struct ubi_wl_entry *e; - if (pnum == pe->e->pnum) - break; + e = ubi->lookuptbl[pnum]; + if (!e) + return -ENODEV; - if (pnum < pe->e->pnum) - p = p->rb_left; - else - p = p->rb_right; - } + if (paranoid_check_in_pq(ubi, e)) + return -ENODEV; - ubi_assert(pe->e->pnum == pnum); - rb_erase(&pe->rb_aec, &ubi->prot.aec); - rb_erase(&pe->rb_pnum, &ubi->prot.pnum); - kfree(pe); + list_del(&e->u.list); + dbg_wl("deleted PEB %d from the protection queue", e->pnum); + return 0; } /** @@ -584,7 +493,8 @@ * This function returns zero in case of success and a negative error code in * case of failure. */ -static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, int torture) +static int sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e, + int torture) { int err; struct ubi_ec_hdr *ec_hdr; @@ -636,48 +546,47 @@ } /** - * check_protection_over - check if it is time to stop protecting some - * physical eraseblocks. + * serve_prot_queue - check if it is time to stop protecting PEBs. * @ubi: UBI device description object * - * This function is called after each erase operation, when the absolute erase - * counter is incremented, to check if some physical eraseblock have not to be - * protected any longer. These physical eraseblocks are moved from the - * protection trees to the used tree. + * This function is called after each erase operation and removes PEBs from the + * tail of the protection queue. 
These PEBs have been protected for long enough + * and should be moved to the used tree. */ -static void check_protection_over(struct ubi_device *ubi) +static void serve_prot_queue(struct ubi_device *ubi) { - struct ubi_wl_prot_entry *pe; + struct ubi_wl_entry *e, *tmp; + int count; /* * There may be several protected physical eraseblock to remove, * process them all. */ - while (1) { - spin_lock(&ubi->wl_lock); - if (!ubi->prot.aec.rb_node) { - spin_unlock(&ubi->wl_lock); - break; - } - - pe = rb_entry(rb_first(&ubi->prot.aec), - struct ubi_wl_prot_entry, rb_aec); +repeat: + count = 0; + spin_lock(&ubi->wl_lock); + list_for_each_entry_safe(e, tmp, &ubi->pq[ubi->pq_head], u.list) { + dbg_wl("PEB %d EC %d protection over, move to used tree", + e->pnum, e->ec); - if (pe->abs_ec > ubi->abs_ec) { + list_del(&e->u.list); + wl_tree_add(e, &ubi->used); + if (count++ > 32) { + /* + * Let's be nice and avoid holding the spinlock for + * too long. + */ spin_unlock(&ubi->wl_lock); - break; + cond_resched(); + goto repeat; } - - dbg_wl("PEB %d protection over, abs_ec %llu, PEB abs_ec %llu", - pe->e->pnum, ubi->abs_ec, pe->abs_ec); - rb_erase(&pe->rb_aec, &ubi->prot.aec); - rb_erase(&pe->rb_pnum, &ubi->prot.pnum); - wl_tree_add(pe->e, &ubi->used); - spin_unlock(&ubi->wl_lock); - - kfree(pe); - cond_resched(); } + + ubi->pq_head += 1; + if (ubi->pq_head == UBI_PROT_QUEUE_LEN) + ubi->pq_head = 0; + ubi_assert(ubi->pq_head >= 0 && ubi->pq_head < UBI_PROT_QUEUE_LEN); + spin_unlock(&ubi->wl_lock); } /** @@ -685,8 +594,8 @@ * @ubi: UBI device description object * @wrk: the work to schedule * - * This function enqueues a work defined by @wrk to the tail of the pending - * works list. + * This function adds a work defined by @wrk to the tail of the pending works + * list. 
*/ static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk) { @@ -744,12 +653,11 @@ static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk, int cancel) { - int err, put = 0; + int err, scrubbing = 0, torture = 0; struct ubi_wl_entry *e1, *e2; struct ubi_vid_hdr *vid_hdr; kfree(wrk); - if (cancel) return 0; @@ -757,21 +665,17 @@ if (!vid_hdr) return -ENOMEM; + mutex_lock(&ubi->move_mutex); spin_lock(&ubi->wl_lock); + ubi_assert(!ubi->move_from && !ubi->move_to); + ubi_assert(!ubi->move_to_put); - /* - * Only one WL worker at a time is supported at this implementation, so - * make sure a PEB is not being moved already. - */ - if (ubi->move_to || !ubi->free.rb_node || + if (!ubi->free.rb_node || (!ubi->used.rb_node && !ubi->scrub.rb_node)) { /* - * Only one WL worker at a time is supported at this - * implementation, so if a LEB is already being moved, cancel. - * - * No free physical eraseblocks? Well, we cancel wear-leveling - * then. It will be triggered again when a free physical - * eraseblock appears. + * No free physical eraseblocks? Well, they must be waiting in + * the queue to be erased. Cancel movement - it will be + * triggered again when a free physical eraseblock appears. * * No used physical eraseblocks? They must be temporarily * protected from being moved. They will be moved to the @@ -780,10 +684,7 @@ */ dbg_wl("cancel WL, a list is empty: free %d, used %d", !ubi->free.rb_node, !ubi->used.rb_node); - ubi->wl_scheduled = 0; - spin_unlock(&ubi->wl_lock); - ubi_free_vid_hdr(ubi, vid_hdr); - return 0; + goto out_cancel; } if (!ubi->scrub.rb_node) { @@ -792,33 +693,30 @@ * highly worn-out free physical eraseblock. If the erase * counters differ much enough, start wear-leveling. 
*/ - e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); + e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) { dbg_wl("no WL needed: min used EC %d, max free EC %d", e1->ec, e2->ec); - ubi->wl_scheduled = 0; - spin_unlock(&ubi->wl_lock); - ubi_free_vid_hdr(ubi, vid_hdr); - return 0; + goto out_cancel; } paranoid_check_in_wl_tree(e1, &ubi->used); - rb_erase(&e1->rb, &ubi->used); + rb_erase(&e1->u.rb, &ubi->used); dbg_wl("move PEB %d EC %d to PEB %d EC %d", e1->pnum, e1->ec, e2->pnum, e2->ec); } else { - e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, rb); + /* Perform scrubbing */ + scrubbing = 1; + e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); paranoid_check_in_wl_tree(e1, &ubi->scrub); - rb_erase(&e1->rb, &ubi->scrub); + rb_erase(&e1->u.rb, &ubi->scrub); dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum); } paranoid_check_in_wl_tree(e2, &ubi->free); - rb_erase(&e2->rb, &ubi->free); - ubi_assert(!ubi->move_from && !ubi->move_to); - ubi_assert(!ubi->move_to_put && !ubi->move_from_put); + rb_erase(&e2->u.rb, &ubi->free); ubi->move_from = e1; ubi->move_to = e2; spin_unlock(&ubi->wl_lock); @@ -828,6 +726,10 @@ * We so far do not know which logical eraseblock our physical * eraseblock (@e1) belongs to. We have to read the volume identifier * header first. + * + * Note, we are protected from this PEB being unmapped and erased. The + * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB + * which is being moved was unmapped. */ err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0); @@ -842,97 +744,145 @@ * likely have the VID header in place. 
*/ dbg_wl("PEB %d has no VID header", e1->pnum); - err = 0; - } else { - ubi_err("error %d while reading VID header from PEB %d", - err, e1->pnum); - if (err > 0) - err = -EIO; + goto out_not_moved; } - goto error; + + ubi_err("error %d while reading VID header from PEB %d", + err, e1->pnum); + if (err > 0) + err = -EIO; + goto out_error; } err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr); if (err) { - if (err == UBI_IO_BITFLIPS) - err = 0; - goto error; + if (err == -EAGAIN) + goto out_not_moved; + if (err < 0) + goto out_error; + if (err == 2) { + /* Target PEB write error, torture it */ + torture = 1; + goto out_not_moved; + } + + /* + * The LEB has not been moved because the volume is being + * deleted or the PEB has been put meanwhile. We should prevent + * this PEB from being selected for wear-leveling movement + * again, so put it to the protection queue. + */ + + dbg_wl("canceled moving PEB %d", e1->pnum); + ubi_assert(err == 1); + + ubi_free_vid_hdr(ubi, vid_hdr); + vid_hdr = NULL; + + spin_lock(&ubi->wl_lock); + prot_queue_add(ubi, e1); + ubi_assert(!ubi->move_to_put); + ubi->move_from = ubi->move_to = NULL; + ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + + e1 = NULL; + err = schedule_erase(ubi, e2, 0); + if (err) + goto out_error; + mutex_unlock(&ubi->move_mutex); + return 0; } + /* The PEB has been successfully moved */ ubi_free_vid_hdr(ubi, vid_hdr); + vid_hdr = NULL; + if (scrubbing) + ubi_msg("scrubbed PEB %d, data moved to PEB %d", + e1->pnum, e2->pnum); + spin_lock(&ubi->wl_lock); - if (!ubi->move_to_put) + if (!ubi->move_to_put) { wl_tree_add(e2, &ubi->used); - else - put = 1; + e2 = NULL; + } ubi->move_from = ubi->move_to = NULL; - ubi->move_from_put = ubi->move_to_put = 0; - ubi->wl_scheduled = 0; + ubi->move_to_put = ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); - if (put) { + err = schedule_erase(ubi, e1, 0); + if (err) { + e1 = NULL; + goto out_error; + } + + if (e2) { /* * Well, the target PEB was put meanwhile, 
schedule it for * erasure. */ dbg_wl("PEB %d was put meanwhile, erase", e2->pnum); err = schedule_erase(ubi, e2, 0); - if (err) { - kmem_cache_free(wl_entries_slab, e2); - ubi_ro_mode(ubi); - } - } - - err = schedule_erase(ubi, e1, 0); - if (err) { - kmem_cache_free(wl_entries_slab, e1); - ubi_ro_mode(ubi); + if (err) + goto out_error; } dbg_wl("done"); - return err; + mutex_unlock(&ubi->move_mutex); + return 0; /* - * Some error occurred. @e1 was not changed, so return it back. @e2 - * might be changed, schedule it for erasure. + * For some reasons the LEB was not moved, might be an error, might be + * something else. @e1 was not changed, so return it back. @e2 might + * have been changed, schedule it for erasure. */ -error: - if (err) - dbg_wl("error %d occurred, cancel operation", err); - ubi_assert(err <= 0); - +out_not_moved: + dbg_wl("canceled moving PEB %d", e1->pnum); ubi_free_vid_hdr(ubi, vid_hdr); + vid_hdr = NULL; spin_lock(&ubi->wl_lock); - ubi->wl_scheduled = 0; - if (ubi->move_from_put) - put = 1; + if (scrubbing) + wl_tree_add(e1, &ubi->scrub); else wl_tree_add(e1, &ubi->used); + ubi_assert(!ubi->move_to_put); ubi->move_from = ubi->move_to = NULL; - ubi->move_from_put = ubi->move_to_put = 0; + ubi->wl_scheduled = 0; spin_unlock(&ubi->wl_lock); - if (put) { - /* - * Well, the target PEB was put meanwhile, schedule it for - * erasure. 
- */ - dbg_wl("PEB %d was put meanwhile, erase", e1->pnum); - err = schedule_erase(ubi, e1, 0); - if (err) { - kmem_cache_free(wl_entries_slab, e1); - ubi_ro_mode(ubi); - } - } + e1 = NULL; + err = schedule_erase(ubi, e2, torture); + if (err) + goto out_error; - err = schedule_erase(ubi, e2, 0); - if (err) { - kmem_cache_free(wl_entries_slab, e2); - ubi_ro_mode(ubi); - } + mutex_unlock(&ubi->move_mutex); + return 0; - yield(); +out_error: + ubi_err("error %d while moving PEB %d to PEB %d", + err, e1->pnum, e2->pnum); + + ubi_free_vid_hdr(ubi, vid_hdr); + spin_lock(&ubi->wl_lock); + ubi->move_from = ubi->move_to = NULL; + ubi->move_to_put = ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + + if (e1) + kmem_cache_free(ubi_wl_entry_slab, e1); + if (e2) + kmem_cache_free(ubi_wl_entry_slab, e2); + ubi_ro_mode(ubi); + + mutex_unlock(&ubi->move_mutex); return err; + +out_cancel: + ubi->wl_scheduled = 0; + spin_unlock(&ubi->wl_lock); + mutex_unlock(&ubi->move_mutex); + ubi_free_vid_hdr(ubi, vid_hdr); + return 0; } /** @@ -970,7 +920,7 @@ * erase counter of free physical eraseblocks is greater then * %UBI_WL_THRESHOLD. */ - e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, rb); + e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb); e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF); if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) @@ -1020,7 +970,7 @@ if (cancel) { dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec); kfree(wl_wrk); - kmem_cache_free(wl_entries_slab, e); + kmem_cache_free(ubi_wl_entry_slab, e); return 0; } @@ -1032,15 +982,14 @@ kfree(wl_wrk); spin_lock(&ubi->wl_lock); - ubi->abs_ec += 1; wl_tree_add(e, &ubi->free); spin_unlock(&ubi->wl_lock); /* - * One more erase operation has happened, take care about protected - * physical eraseblocks. + * One more erase operation has happened, take care about + * protected physical eraseblocks. 
*/ - check_protection_over(ubi); + serve_prot_queue(ubi); /* And take care about wear-leveling */ err = ensure_wear_leveling(ubi); @@ -1049,7 +998,7 @@ ubi_err("failed to erase PEB %d, error %d", pnum, err); kfree(wl_wrk); - kmem_cache_free(wl_entries_slab, e); + kmem_cache_free(ubi_wl_entry_slab, e); if (err == -EINTR || err == -ENOMEM || err == -EAGAIN || err == -EBUSY) { @@ -1119,8 +1068,7 @@ } /** - * ubi_wl_put_peb - return a physical eraseblock to the wear-leveling - * unit. + * ubi_wl_put_peb - return a PEB to the wear-leveling sub-system. * @ubi: UBI device description object * @pnum: physical eraseblock to return * @torture: if this physical eraseblock has to be tortured @@ -1128,7 +1076,7 @@ * This function is called to return physical eraseblock @pnum to the pool of * free physical eraseblocks. The @torture flag has to be set if an I/O error * occurred to this @pnum and it has to be tested. This function returns zero - * in case of success and a negative error code in case of failure. + * in case of success, and a negative error code in case of failure. */ int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture) { @@ -1139,8 +1087,8 @@ ubi_assert(pnum >= 0); ubi_assert(pnum < ubi->peb_count); +retry: spin_lock(&ubi->wl_lock); - e = ubi->lookuptbl[pnum]; if (e == ubi->move_from) { /* @@ -1148,17 +1096,22 @@ * be moved. It will be scheduled for erasure in the * wear-leveling worker. */ - dbg_wl("PEB %d is being moved", pnum); - ubi_assert(!ubi->move_from_put); - ubi->move_from_put = 1; + dbg_wl("PEB %d is being moved, wait", pnum); spin_unlock(&ubi->wl_lock); - return 0; + + /* Wait for the WL worker by taking the @ubi->move_mutex */ + mutex_lock(&ubi->move_mutex); + mutex_unlock(&ubi->move_mutex); + goto retry; } else if (e == ubi->move_to) { /* * User is putting the physical eraseblock which was selected * as the target the data is moved to. 
It may happen if the EBA - * unit already re-mapped the LEB but the WL unit did has not - * put the PEB to the "used" tree. + * sub-system already re-mapped the LEB in 'ubi_eba_copy_leb()' + * but the WL sub-system has not put the PEB to the "used" tree + * yet, but it is about to do this. So we just set a flag which + * will tell the WL worker that the PEB is not needed anymore + * and should be scheduled for erasure. */ dbg_wl("PEB %d is the target of data moving", pnum); ubi_assert(!ubi->move_to_put); @@ -1168,12 +1121,19 @@ } else { if (in_wl_tree(e, &ubi->used)) { paranoid_check_in_wl_tree(e, &ubi->used); - rb_erase(&e->rb, &ubi->used); + rb_erase(&e->u.rb, &ubi->used); } else if (in_wl_tree(e, &ubi->scrub)) { paranoid_check_in_wl_tree(e, &ubi->scrub); - rb_erase(&e->rb, &ubi->scrub); - } else - prot_tree_del(ubi, e->pnum); + rb_erase(&e->u.rb, &ubi->scrub); + } else { + err = prot_queue_del(ubi, e->pnum); + if (err) { + ubi_err("PEB %d not found", pnum); + ubi_ro_mode(ubi); + spin_unlock(&ubi->wl_lock); + return err; + } + } } spin_unlock(&ubi->wl_lock); @@ -1201,7 +1161,7 @@ { struct ubi_wl_entry *e; - ubi_msg("schedule PEB %d for scrubbing", pnum); + dbg_msg("schedule PEB %d for scrubbing", pnum); retry: spin_lock(&ubi->wl_lock); @@ -1226,9 +1186,18 @@ if (in_wl_tree(e, &ubi->used)) { paranoid_check_in_wl_tree(e, &ubi->used); - rb_erase(&e->rb, &ubi->used); - } else - prot_tree_del(ubi, pnum); + rb_erase(&e->u.rb, &ubi->used); + } else { + int err; + + err = prot_queue_del(ubi, e->pnum); + if (err) { + ubi_err("PEB %d not found", pnum); + ubi_ro_mode(ubi); + spin_unlock(&ubi->wl_lock); + return err; + } + } wl_tree_add(e, &ubi->scrub); spin_unlock(&ubi->wl_lock); @@ -1249,17 +1218,32 @@ */ int ubi_wl_flush(struct ubi_device *ubi) { - int err, pending_count; + int err; - pending_count = ubi->works_count; + /* + * Erase while the pending works queue is not empty, but not more than + * the number of currently pending works. 
+ */ + dbg_wl("flush (%d pending works)", ubi->works_count); + while (ubi->works_count) { + err = do_work(ubi); + if (err) + return err; + } - dbg_wl("flush (%d pending works)", pending_count); + /* + * Make sure all the works which have been done in parallel are + * finished. + */ + down_write(&ubi->work_sem); + up_write(&ubi->work_sem); /* - * Erase while the pending works queue is not empty, but not more then - * the number of currently pending works. + * And in case last was the WL worker and it canceled the LEB + * movement, flush again. */ - while (pending_count-- > 0) { + while (ubi->works_count) { + dbg_wl("flush more (%d pending works)", ubi->works_count); err = do_work(ubi); if (err) return err; @@ -1284,17 +1268,17 @@ else if (rb->rb_right) rb = rb->rb_right; else { - e = rb_entry(rb, struct ubi_wl_entry, rb); + e = rb_entry(rb, struct ubi_wl_entry, u.rb); rb = rb_parent(rb); if (rb) { - if (rb->rb_left == &e->rb) + if (rb->rb_left == &e->u.rb) rb->rb_left = NULL; else rb->rb_right = NULL; } - kmem_cache_free(wl_entries_slab, e); + kmem_cache_free(ubi_wl_entry_slab, e); } } } @@ -1303,7 +1287,7 @@ * ubi_thread - UBI background thread. * @u: the UBI device description object pointer */ -static int ubi_thread(void *u) +int ubi_thread(void *u) { int failures = 0; struct ubi_device *ubi = u; @@ -1316,7 +1300,7 @@ int err; if (kthread_should_stop()) - goto out; + break; if (try_to_freeze()) continue; @@ -1343,7 +1327,8 @@ ubi_msg("%s: %d consecutive failures", ubi->bgt_name, WL_MAX_FAILURES); ubi_ro_mode(ubi); - break; + ubi->thread_enabled = 0; + continue; } } else failures = 0; @@ -1351,7 +1336,6 @@ cond_resched(); } -out: dbg_wl("background thread \"%s\" is killed", ubi->bgt_name); return 0; } @@ -1374,8 +1358,7 @@ } /** - * ubi_wl_init_scan - initialize the wear-leveling unit using scanning - * information. + * ubi_wl_init_scan - initialize the WL sub-system using scanning information. 
* @ubi: UBI device description object * @si: scanning information * @@ -1384,46 +1367,34 @@ */ int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si) { - int err; + int err, i; struct rb_node *rb1, *rb2; struct ubi_scan_volume *sv; struct ubi_scan_leb *seb, *tmp; struct ubi_wl_entry *e; - ubi->used = ubi->free = ubi->scrub = RB_ROOT; - ubi->prot.pnum = ubi->prot.aec = RB_ROOT; spin_lock_init(&ubi->wl_lock); + mutex_init(&ubi->move_mutex); + init_rwsem(&ubi->work_sem); ubi->max_ec = si->max_ec; INIT_LIST_HEAD(&ubi->works); sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num); - ubi->bgt_thread = kthread_create(ubi_thread, ubi, ubi->bgt_name); - if (IS_ERR(ubi->bgt_thread)) { - err = PTR_ERR(ubi->bgt_thread); - ubi_err("cannot spawn \"%s\", error %d", ubi->bgt_name, - err); - return err; - } - - if (ubi_devices_cnt == 0) { - wl_entries_slab = kmem_cache_create("ubi_wl_entry_slab", - sizeof(struct ubi_wl_entry), - 0, 0, NULL); - if (!wl_entries_slab) - return -ENOMEM; - } - err = -ENOMEM; ubi->lookuptbl = kzalloc(ubi->peb_count * sizeof(void *), GFP_KERNEL); if (!ubi->lookuptbl) - goto out_free; + return err; + + for (i = 0; i < UBI_PROT_QUEUE_LEN; i++) + INIT_LIST_HEAD(&ubi->pq[i]); + ubi->pq_head = 0; list_for_each_entry_safe(seb, tmp, &si->erase, u.list) { cond_resched(); - e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) goto out_free; @@ -1431,7 +1402,7 @@ e->ec = seb->ec; ubi->lookuptbl[e->pnum] = e; if (schedule_erase(ubi, e, 0)) { - kmem_cache_free(wl_entries_slab, e); + kmem_cache_free(ubi_wl_entry_slab, e); goto out_free; } } @@ -1439,7 +1410,7 @@ list_for_each_entry(seb, &si->free, u.list) { cond_resched(); - e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) goto out_free; @@ -1453,7 +1424,7 @@ list_for_each_entry(seb, &si->corr, u.list) { cond_resched(); - e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + 
e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) goto out_free; @@ -1461,7 +1432,7 @@ e->ec = seb->ec; ubi->lookuptbl[e->pnum] = e; if (schedule_erase(ubi, e, 0)) { - kmem_cache_free(wl_entries_slab, e); + kmem_cache_free(ubi_wl_entry_slab, e); goto out_free; } } @@ -1470,7 +1441,7 @@ ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) { cond_resched(); - e = kmem_cache_alloc(wl_entries_slab, GFP_KERNEL); + e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL); if (!e) goto out_free; @@ -1510,70 +1481,45 @@ tree_destroy(&ubi->free); tree_destroy(&ubi->scrub); kfree(ubi->lookuptbl); - if (ubi_devices_cnt == 0) - kmem_cache_destroy(wl_entries_slab); return err; } /** - * protection_trees_destroy - destroy the protection RB-trees. + * protection_queue_destroy - destroy the protection queue. * @ubi: UBI device description object */ -static void protection_trees_destroy(struct ubi_device *ubi) +static void protection_queue_destroy(struct ubi_device *ubi) { - struct rb_node *rb; - struct ubi_wl_prot_entry *pe; - - rb = ubi->prot.aec.rb_node; - while (rb) { - if (rb->rb_left) - rb = rb->rb_left; - else if (rb->rb_right) - rb = rb->rb_right; - else { - pe = rb_entry(rb, struct ubi_wl_prot_entry, rb_aec); - - rb = rb_parent(rb); - if (rb) { - if (rb->rb_left == &pe->rb_aec) - rb->rb_left = NULL; - else - rb->rb_right = NULL; - } + int i; + struct ubi_wl_entry *e, *tmp; - kmem_cache_free(wl_entries_slab, pe->e); - kfree(pe); + for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) { + list_for_each_entry_safe(e, tmp, &ubi->pq[i], u.list) { + list_del(&e->u.list); + kmem_cache_free(ubi_wl_entry_slab, e); } } } /** - * ubi_wl_close - close the wear-leveling unit. + * ubi_wl_close - close the wear-leveling sub-system. 
* @ubi: UBI device description object */ void ubi_wl_close(struct ubi_device *ubi) { - dbg_wl("disable \"%s\"", ubi->bgt_name); - if (ubi->bgt_thread) - kthread_stop(ubi->bgt_thread); - - dbg_wl("close the UBI wear-leveling unit"); - + dbg_wl("close the WL sub-system"); cancel_pending(ubi); - protection_trees_destroy(ubi); + protection_queue_destroy(ubi); tree_destroy(&ubi->used); tree_destroy(&ubi->free); tree_destroy(&ubi->scrub); kfree(ubi->lookuptbl); - if (ubi_devices_cnt == 1) - kmem_cache_destroy(wl_entries_slab); } #ifdef CONFIG_MTD_UBI_DEBUG_PARANOID /** - * paranoid_check_ec - make sure that the erase counter of a physical eraseblock - * is correct. + * paranoid_check_ec - make sure that the erase counter of a PEB is correct. * @ubi: UBI device description object * @pnum: the physical eraseblock number to check * @ec: the erase counter to check @@ -1614,13 +1560,12 @@ } /** - * paranoid_check_in_wl_tree - make sure that a wear-leveling entry is present - * in a WL RB-tree. + * paranoid_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree. * @e: the wear-leveling entry to check * @root: the root of the tree * - * This function returns zero if @e is in the @root RB-tree and %1 if it - * is not. + * This function returns zero if @e is in the @root RB-tree and %1 if it is + * not. */ static int paranoid_check_in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) @@ -1634,4 +1579,27 @@ return 1; } +/** + * paranoid_check_in_pq - check if wear-leveling entry is in the protection + * queue. + * @ubi: UBI device description object + * @e: the wear-leveling entry to check + * + * This function returns zero if @e is in @ubi->pq and %1 if it is not. 
+ */ +static int paranoid_check_in_pq(struct ubi_device *ubi, struct ubi_wl_entry *e) +{ + struct ubi_wl_entry *p; + int i; + + for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i) + list_for_each_entry(p, &ubi->pq[i], u.list) + if (p == e) + return 0; + + ubi_err("paranoid check failed for PEB %d, EC %d, Protect queue", + e->pnum, e->ec); + ubi_dbg_dump_stack(); + return 1; +} #endif /* CONFIG_MTD_UBI_DEBUG_PARANOID */ diff -Nurd linux-2.6.24/fs/Kconfig ubifs-v2.6.24/fs/Kconfig --- linux-2.6.24/fs/Kconfig 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/fs/Kconfig 2009-04-07 17:14:47.000000000 +0200 @@ -1385,6 +1385,9 @@ endchoice +# UBIFS File system configuration +source "fs/ubifs/Kconfig" + config CRAMFS tristate "Compressed ROM file system support (cramfs)" depends on BLOCK diff -Nurd linux-2.6.24/fs/Makefile ubifs-v2.6.24/fs/Makefile --- linux-2.6.24/fs/Makefile 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/fs/Makefile 2009-04-07 17:14:47.000000000 +0200 @@ -99,6 +99,7 @@ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ +obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ diff -Nurd linux-2.6.24/fs/fs-writeback.c ubifs-v2.6.24/fs/fs-writeback.c --- linux-2.6.24/fs/fs-writeback.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/fs/fs-writeback.c 2009-04-07 17:14:47.000000000 +0200 @@ -386,8 +386,6 @@ * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so * that it can be located for waiting on in __writeback_single_inode(). * - * Called under inode_lock. - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. 
For other superblocks, @@ -403,11 +401,12 @@ * on the writer throttling path, and we get decent balancing between many * throttled threads: we don't want them all piling up on inode_sync_wait. */ -static void -sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) +void generic_sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) queue_io(sb, wbc->older_than_this); @@ -482,8 +481,16 @@ if (wbc->nr_to_write <= 0) break; } + spin_unlock(&inode_lock); return; /* Leave any unwritten inodes on s_io */ } +EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); + +static void sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc) +{ + generic_sync_sb_inodes(sb, wbc); +} /* * Start writeback of dirty pagecache data against all unlocked inodes. @@ -524,11 +531,8 @@ * be unmounted by the time it is released. */ if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) { - spin_lock(&inode_lock); + if (sb->s_root) sync_sb_inodes(sb, wbc); - spin_unlock(&inode_lock); - } up_read(&sb->s_umount); } spin_lock(&sb_lock); @@ -566,9 +570,7 @@ (inodes_stat.nr_inodes - inodes_stat.nr_unused) + nr_dirty + nr_unstable; wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - spin_lock(&inode_lock); sync_sb_inodes(sb, &wbc); - spin_unlock(&inode_lock); } /* diff -Nurd linux-2.6.24/fs/ubifs/Kconfig ubifs-v2.6.24/fs/ubifs/Kconfig --- linux-2.6.24/fs/ubifs/Kconfig 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/Kconfig 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,72 @@ +config UBIFS_FS + tristate "UBIFS file system support" + select CRC16 + select CRC32 + select CRYPTO if UBIFS_FS_ADVANCED_COMPR + select CRYPTO if UBIFS_FS_LZO + select CRYPTO if UBIFS_FS_ZLIB + select CRYPTO_LZO if UBIFS_FS_LZO + select CRYPTO_DEFLATE if UBIFS_FS_ZLIB + depends on MTD_UBI + help + UBIFS is a file system for 
flash devices which works on top of UBI. + +config UBIFS_FS_XATTR + bool "Extended attributes support" + depends on UBIFS_FS + help + This option enables support of extended attributes. + +config UBIFS_FS_ADVANCED_COMPR + bool "Advanced compression options" + depends on UBIFS_FS + help + This option allows to explicitly choose which compressions, if any, + are enabled in UBIFS. Removing compressors means inability to read + existing file systems. + + If unsure, say 'N'. + +config UBIFS_FS_LZO + bool "LZO compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + LZO compressor is generally faster than zlib but compresses worse. + Say 'Y' if unsure. + +config UBIFS_FS_ZLIB + bool "ZLIB compression support" if UBIFS_FS_ADVANCED_COMPR + depends on UBIFS_FS + default y + help + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. + +# Debugging-related stuff +config UBIFS_FS_DEBUG + bool "Enable debugging" + depends on UBIFS_FS + select DEBUG_FS + select KALLSYMS_ALL + help + This option enables UBIFS debugging. + +config UBIFS_FS_DEBUG_MSG_LVL + int "Default message level (0 = no extra messages, 3 = lots)" + depends on UBIFS_FS_DEBUG + default "0" + help + This controls the amount of debugging messages produced by UBIFS. + If reporting bugs, please try to have available a full dump of the + messages at level 1 while the misbehaviour was occurring. Level 2 + may become necessary if level 1 messages were not enough to find the + bug. Generally Level 3 should be avoided. + +config UBIFS_FS_DEBUG_CHKS + bool "Enable extra checks" + depends on UBIFS_FS_DEBUG + help + If extra checks are enabled UBIFS will check the consistency of its + internal data structures during operation. However, UBIFS performance + is dramatically slower when this option is selected especially if the + file system is large. 
diff -Nurd linux-2.6.24/fs/ubifs/Makefile ubifs-v2.6.24/fs/ubifs/Makefile --- linux-2.6.24/fs/ubifs/Makefile 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/Makefile 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,9 @@ +obj-$(CONFIG_UBIFS_FS) += ubifs.o + +ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o +ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o +ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o +ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o + +ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o +ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o diff -Nurd linux-2.6.24/fs/ubifs/budget.c ubifs-v2.6.24/fs/ubifs/budget.c --- linux-2.6.24/fs/ubifs/budget.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/budget.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,755 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the budgeting sub-system which is responsible for UBIFS + * space management. 
+ * + * Factors such as compression, wasted space at the ends of LEBs, space in other + * journal heads, the effect of updates on the index, and so on, make it + * impossible to accurately predict the amount of space needed. Consequently + * approximations are used. + */ + +#include "ubifs.h" +#include <linux/writeback.h> + +/* + * When pessimistic budget calculations say that there is not enough space, + * UBIFS starts writing back dirty inodes and pages, doing garbage collection, + * or committing. The below constant defines maximum number of times UBIFS + * repeats the operations. + */ +#define MAX_MKSPC_RETRIES 3 + +/* + * The below constant defines amount of dirty pages which should be written + * back when trying to shrink the liability. + */ +#define NR_TO_WRITE 16 + +/** + * shrink_liability - write-back some dirty pages/inodes. + * @c: UBIFS file-system description object + * @nr_to_write: how many dirty pages to write-back + * + * This function shrinks UBIFS liability by means of writing back some amount + * of dirty inodes and their pages. Returns the amount of pages which were + * written back. The returned value does not include dirty inodes which were + * synchronized. + * + * Note, this function synchronizes even VFS inodes which are locked + * (@i_mutex) by the caller of the budgeting function, because write-back does + * not touch @i_mutex. + */ +static int shrink_liability(struct ubifs_info *c, int nr_to_write) +{ + int nr_written; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .range_end = LLONG_MAX, + .nr_to_write = nr_to_write, + }; + + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + + if (!nr_written) { + /* + * Re-try again but wait on pages/inodes which are being + * written-back concurrently (e.g., by pdflush). 
+ */ + memset(&wbc, 0, sizeof(struct writeback_control)); + wbc.sync_mode = WB_SYNC_ALL; + wbc.range_end = LLONG_MAX; + wbc.nr_to_write = nr_to_write; + generic_sync_sb_inodes(c->vfs_sb, &wbc); + nr_written = nr_to_write - wbc.nr_to_write; + } + + dbg_budg("%d pages were written back", nr_written); + return nr_written; +} + + +/** + * run_gc - run garbage collector. + * @c: UBIFS file-system description object + * + * This function runs garbage collector to make some more free space. Returns + * zero if a free LEB has been produced, %-EAGAIN if commit is required, and a + * negative error code in case of failure. + */ +static int run_gc(struct ubifs_info *c) +{ + int err, lnum; + + /* Make some free space by garbage-collecting dirty space */ + down_read(&c->commit_sem); + lnum = ubifs_garbage_collect(c, 1); + up_read(&c->commit_sem); + if (lnum < 0) + return lnum; + + /* GC freed one LEB, return it to lprops */ + dbg_budg("GC freed LEB %d", lnum); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + return 0; +} + +/** + * get_liability - calculate current liability. + * @c: UBIFS file-system description object + * + * This function calculates and returns current UBIFS liability, i.e. the + * amount of bytes UBIFS has "promised" to write to the media. + */ +static long long get_liability(struct ubifs_info *c) +{ + long long liab; + + spin_lock(&c->space_lock); + liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth; + spin_unlock(&c->space_lock); + return liab; +} + +/** + * make_free_space - make more free space on the file-system. + * @c: UBIFS file-system description object + * + * This function is called when an operation cannot be budgeted because there + * is supposedly no free space. 
But in most cases there is some free space: + * o budgeting is pessimistic, so it always budgets more then it is actually + * needed, so shrinking the liability is one way to make free space - the + * cached data will take less space then it was budgeted for; + * o GC may turn some dark space into free space (budgeting treats dark space + * as not available); + * o commit may free some LEB, i.e., turn freeable LEBs into free LEBs. + * + * So this function tries to do the above. Returns %-EAGAIN if some free space + * was presumably made and the caller has to re-try budgeting the operation. + * Returns %-ENOSPC if it couldn't do more free space, and other negative error + * codes on failures. + */ +static int make_free_space(struct ubifs_info *c) +{ + int err, retries = 0; + long long liab1, liab2; + + do { + liab1 = get_liability(c); + /* + * We probably have some dirty pages or inodes (liability), try + * to write them back. + */ + dbg_budg("liability %lld, run write-back", liab1); + shrink_liability(c, NR_TO_WRITE); + + liab2 = get_liability(c); + if (liab2 < liab1) + return -EAGAIN; + + dbg_budg("new liability %lld (not shrinked)", liab2); + + /* Liability did not shrink again, try GC */ + dbg_budg("Run GC"); + err = run_gc(c); + if (!err) + return -EAGAIN; + + if (err != -EAGAIN && err != -ENOSPC) + /* Some real error happened */ + return err; + + dbg_budg("Run commit (retries %d)", retries); + err = ubifs_run_commit(c); + if (err) + return err; + } while (retries++ < MAX_MKSPC_RETRIES); + + return -ENOSPC; +} + +/** + * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index. + * @c: UBIFS file-system description object + * + * This function calculates and returns the number of LEBs which should be kept + * for index usage. 
+ */ +int ubifs_calc_min_idx_lebs(struct ubifs_info *c) +{ + int idx_lebs; + long long idx_size; + + idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; + /* Reserve thrice the index size: x + (x << 1) == 3 * x */ + idx_size += idx_size << 1; + /* + * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' + * pair, nor similarly the two variables for the new index size, so we + * have to do this costly 64-bit division on fast-path. + */ + idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size); + /* + * The index head is not available for the in-the-gaps method, so add an + * extra LEB to compensate. + */ + idx_lebs += 1; + if (idx_lebs < MIN_INDEX_LEBS) + idx_lebs = MIN_INDEX_LEBS; + return idx_lebs; +} + +/** + * ubifs_calc_available - calculate available FS space. + * @c: UBIFS file-system description object + * @min_idx_lebs: minimum number of LEBs reserved for the index + * + * Returns the amount of FS space available for use (never less than zero). + */ +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) +{ + int subtract_lebs; + long long available; + + available = c->main_bytes - c->lst.total_used; + + /* + * Now 'available' contains theoretically available flash space + * assuming there is no index, so we have to subtract the space which + * is reserved for the index. + */ + subtract_lebs = min_idx_lebs; + + /* Take into account that GC reserves one LEB for its own needs */ + subtract_lebs += 1; + + /* + * The GC journal head LEB is not really accessible. And since + * different write types go to different heads, we may count only on + * one head's space. 
+ */ + subtract_lebs += c->jhead_cnt - 1; + + /* We also reserve one LEB for deletions, which bypass budgeting */ + subtract_lebs += 1; + + available -= (long long)subtract_lebs * c->leb_size; + + /* Subtract the dead space which is not available for use */ + available -= c->lst.total_dead; + + /* + * Subtract dark space, which might or might not be usable - it depends + * on the data which we have on the media and which will be written. If + * this is a lot of uncompressed or not-compressible data, the dark + * space cannot be used. + */ + available -= c->lst.total_dark; + + /* + * However, there is more dark space. The index may be bigger than + * @min_idx_lebs. Those extra LEBs are assumed to be available, but + * their dark space is not included in total_dark, so it is subtracted + * here. + */ + if (c->lst.idx_lebs > min_idx_lebs) { + subtract_lebs = c->lst.idx_lebs - min_idx_lebs; + available -= subtract_lebs * c->dark_wm; + } + + /* The calculations are rough and may end up with a negative number */ + return available > 0 ? available : 0; +} + +/** + * can_use_rp - check whether the user is allowed to use reserved pool. + * @c: UBIFS file-system description object + * + * UBIFS has so-called "reserved pool" which is flash space reserved for the + * superuser and for users whose UID/GID is recorded in UBIFS superblock. + * This function checks whether current user is allowed to use reserved pool. + * Returns %1 if current user is allowed to use reserved pool and %0 otherwise. + */ +static int can_use_rp(struct ubifs_info *c) +{ + if (current->fsuid == c->rp_uid || capable(CAP_SYS_RESOURCE) || + (c->rp_gid != 0 && in_group_p(c->rp_gid))) + return 1; + return 0; +} + +/** + * do_budget_space - reserve flash space for index and data growth. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free LEBs for index growth and + * data. 
+ * + * When budgeting index space, UBIFS reserves thrice as many LEBs as the index + * would take if it was consolidated and written to the flash. This guarantees + * that the "in-the-gaps" commit method always succeeds and UBIFS will always + * be able to commit dirty index. So this function basically adds amount of + * budgeted index space to the size of the current index, multiplies this by 3, + * and makes sure this does not exceed the amount of free LEBs. + * + * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: + * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might + * be large, because UBIFS does not do any index consolidation as long as + * there is free space. IOW, the index may take a lot of LEBs, but the LEBs + * will contain a lot of dirt. + * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW, + * the index may be consolidated to take up to @c->min_idx_lebs LEBs. + * + * This function returns zero in case of success, and %-ENOSPC in case of + * failure. + */ +static int do_budget_space(struct ubifs_info *c) +{ + long long outstanding, available; + int lebs, rsvd_idx_lebs, min_idx_lebs; + + /* First budget index space */ + min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + /* Now 'min_idx_lebs' contains the number of LEBs to reserve */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + + /* + * The number of LEBs that are available to be used by the index is: + * + * @c->lst.empty_lebs + @c->freeable_cnt + @c->idx_gc_cnt - + * @c->lst.taken_empty_lebs + * + * @c->lst.empty_lebs are available because they are empty. 
+ * @c->freeable_cnt are available because they contain only free and + * dirty space, @c->idx_gc_cnt are available because they are index + * LEBs that have been garbage collected and are awaiting the commit + * before they can be used. And the in-the-gaps method will grab these + * if it needs them. 
@c->lst.taken_empty_lebs are empty LEBs that have + * already been allocated for some purpose. + * + * Note, @c->idx_gc_cnt is included in both @c->lst.empty_lebs (because + * these LEBs are empty) and in @c->lst.taken_empty_lebs (because they + * are taken until after the commit). + * + * Note, @c->lst.taken_empty_lebs may temporarily be higher by one + * because of the way we serialize LEB allocations and budgeting. See a + * comment in 'ubifs_find_free_space()'. + */ + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + if (unlikely(rsvd_idx_lebs > lebs)) { + dbg_budg("out of indexing space: min_idx_lebs %d (old %d), " + "rsvd_idx_lebs %d", min_idx_lebs, c->min_idx_lebs, + rsvd_idx_lebs); + return -ENOSPC; + } + + available = ubifs_calc_available(c, min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + + if (unlikely(available < outstanding)) { + dbg_budg("out of data space: available %lld, outstanding %lld", + available, outstanding); + return -ENOSPC; + } + + if (available - outstanding <= c->rp_size && !can_use_rp(c)) + return -ENOSPC; + + c->min_idx_lebs = min_idx_lebs; + return 0; +} + +/** + * calc_idx_growth - calculate approximate index growth from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + * + * For now we assume each new node adds one znode, which is a rather poor + * approximation. + */ +static int calc_idx_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int znodes; + + znodes = req->new_ino + (req->new_page << UBIFS_BLOCKS_PER_PAGE_SHIFT) + + req->new_dent; + return znodes * c->max_idx_node_sz; +} + +/** + * calc_data_growth - calculate approximate amount of new data from budgeting + * request. 
+ * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_data_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int data_growth; + + data_growth = req->new_ino ? c->inode_budget : 0; + if (req->new_page) + data_growth += c->page_budget; + if (req->new_dent) + data_growth += c->dent_budget; + data_growth += req->new_ino_d; + return data_growth; +} + +/** + * calc_dd_growth - calculate approximate amount of data which makes other data + * dirty from budgeting request. + * @c: UBIFS file-system description object + * @req: budgeting request + */ +static int calc_dd_growth(const struct ubifs_info *c, + const struct ubifs_budget_req *req) +{ + int dd_growth; + + dd_growth = req->dirtied_page ? c->page_budget : 0; + + if (req->dirtied_ino) + dd_growth += c->inode_budget << (req->dirtied_ino - 1); + if (req->mod_dent) + dd_growth += c->dent_budget; + dd_growth += req->dirtied_ino_d; + return dd_growth; +} + +/** + * ubifs_budget_space - ensure there is enough space to complete an operation. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function allocates budget for an operation. It uses pessimistic + * approximation of how much flash space the operation needs. The goal of this + * function is to make sure UBIFS always has flash space to flush all dirty + * pages, dirty inodes, and dirty znodes (liability). This function may force + * commit, garbage-collection or write-back. Returns zero in case of success, + * %-ENOSPC if there is no free space and other negative error codes in case of + * failures. 
+ */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + int uninitialized_var(cmt_retries), uninitialized_var(wb_retries); + int err, idx_growth, data_growth, dd_growth, retried = 0; + + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + + data_growth = calc_data_growth(c, req); + dd_growth = calc_dd_growth(c, req); + if (!data_growth && !dd_growth) + return 0; + idx_growth = calc_idx_growth(c, req); + +again: + spin_lock(&c->space_lock); + ubifs_assert(c->budg_idx_growth >= 0); + ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); + + if (unlikely(c->nospace) && (c->nospace_rp || !can_use_rp(c))) { + dbg_budg("no space"); + spin_unlock(&c->space_lock); + return -ENOSPC; + } + + c->budg_idx_growth += idx_growth; + c->budg_data_growth += data_growth; + c->budg_dd_growth += dd_growth; + + err = do_budget_space(c); + if (likely(!err)) { + req->idx_growth = idx_growth; + req->data_growth = data_growth; + req->dd_growth = dd_growth; + spin_unlock(&c->space_lock); + return 0; + } + + /* Budgeting failed: undo the tentative growth added above */ + c->budg_idx_growth -= idx_growth; + c->budg_data_growth -= data_growth; + c->budg_dd_growth -= dd_growth; + spin_unlock(&c->space_lock); + + if (req->fast) { + dbg_budg("no space for fast budgeting"); + return err; + } + + err = make_free_space(c); + cond_resched(); + if (err == -EAGAIN) { + dbg_budg("try again"); + goto again; + } else if (err == -ENOSPC) { + if (!retried) { + retried = 1; + dbg_budg("-ENOSPC, but anyway try once again"); + goto again; + } + dbg_budg("FS is full, -ENOSPC"); + c->nospace = 1; + if 
(can_use_rp(c) || c->rp_size 
== 0) + c->nospace_rp = 1; + smp_wmb(); + } else + ubifs_err("cannot budget space, error %d", err); + return err; +} + +/** + * ubifs_release_budget - release budgeted free space. + * @c: UBIFS file-system description object + * @req: budget request + * + * This function releases the space budgeted by 'ubifs_budget_space()'. Note, + * since the index changes (which were budgeted for in @req->idx_growth) will + * only be written to the media on commit, this function moves the index budget + * from @c->budg_idx_growth to @c->budg_uncommitted_idx. The latter will be + * zeroed by the commit operation. + */ +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) +{ + ubifs_assert(req->new_page <= 1); + ubifs_assert(req->dirtied_page <= 1); + ubifs_assert(req->new_dent <= 1); + ubifs_assert(req->mod_dent <= 1); + ubifs_assert(req->new_ino <= 1); + ubifs_assert(req->new_ino_d <= UBIFS_MAX_INO_DATA); + ubifs_assert(req->dirtied_ino <= 4); + ubifs_assert(req->dirtied_ino_d <= UBIFS_MAX_INO_DATA * 4); + ubifs_assert(!(req->new_ino_d & 7)); + ubifs_assert(!(req->dirtied_ino_d & 7)); + if (!req->recalculate) { + ubifs_assert(req->idx_growth >= 0); + ubifs_assert(req->data_growth >= 0); + ubifs_assert(req->dd_growth >= 0); + } + + if (req->recalculate) { + req->data_growth = calc_data_growth(c, req); + req->dd_growth = calc_dd_growth(c, req); + req->idx_growth = calc_idx_growth(c, req); + } + + if (!req->data_growth && !req->dd_growth) + return; + + c->nospace = c->nospace_rp = 0; + smp_wmb(); + + spin_lock(&c->space_lock); + c->budg_idx_growth -= req->idx_growth; + c->budg_uncommitted_idx += req->idx_growth; + c->budg_data_growth -= req->data_growth; + c->budg_dd_growth -= req->dd_growth; + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + + ubifs_assert(c->budg_idx_growth >= 0); + ubifs_assert(c->budg_data_growth >= 0); + ubifs_assert(c->budg_dd_growth >= 0); + ubifs_assert(c->min_idx_lebs < c->main_lebs); + ubifs_assert(!(c->budg_idx_growth & 7)); + 
ubifs_assert(!(c->budg_data_growth & 7)); + ubifs_assert(!(c->budg_dd_growth & 7)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_convert_page_budget - convert budget of a new page. + * @c: UBIFS file-system description object + * + * This function converts budget which was allocated for a new page of data to + * the budget of changing an existing page of data. The latter is smaller than + * the former, so this function only does simple re-calculation and does not + * involve any write-back. + */ +void ubifs_convert_page_budget(struct ubifs_info *c) +{ + spin_lock(&c->space_lock); + /* Release the index growth reservation */ + c->budg_idx_growth -= c->max_idx_node_sz << UBIFS_BLOCKS_PER_PAGE_SHIFT; + /* Release the data growth reservation */ + c->budg_data_growth -= c->page_budget; + /* Increase the dirty data growth reservation instead */ + c->budg_dd_growth += c->page_budget; + /* And re-calculate the indexing space reservation */ + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_release_dirty_inode_budget - release dirty inode budget. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to release the budget for + * + * This function releases budget corresponding to a dirty inode. It is usually + * called after the inode has been written to the media and marked as + * clean. + */ +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui) +{ + struct ubifs_budget_req req; + + memset(&req, 0, sizeof(struct ubifs_budget_req)); + req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); + ubifs_release_budget(c, &req); +} + +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. 
User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexing nodes as well. This introduces additional + * overhead, and UBIFS has to report slightly less free space to meet the above + * expectations. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, long long free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is half + * the maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reserves thrice as much space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + return div_u64(free, divisor); +} + +/** + * ubifs_get_free_space_nolock - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates amount of free space to report to user-space. 
+ * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of LEBs, etc), it cannot report real amount of + * free flash space it has (well, because not all dirty space is reclaimable, + * UBIFS does not actually know the real amount). If UBIFS did so, it would + * break user expectations about what free space is. Users seem accustomed + * to assume that if the file-system reports N bytes of free space, they would + * be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. + */ +long long ubifs_get_free_space_nolock(struct ubifs_info *c) +{ + int rsvd_idx_lebs, lebs; + long long available, outstanding, free; + + ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + outstanding = c->budg_data_growth + c->budg_dd_growth; + available = ubifs_calc_available(c, c->min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. 
+ */ + if (c->min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); + + if (available > outstanding) + free = ubifs_reported_space(c, available - outstanding); + else + free = 0; + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. + * @c: UBIFS file-system description object + * + * This function calculates and returns amount of free space to report to + * user-space. + */ +long long ubifs_get_free_space(struct ubifs_info *c) +{ + long long free; + + spin_lock(&c->space_lock); + free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); + + return free; +} diff -Nurd linux-2.6.24/fs/ubifs/commit.c ubifs-v2.6.24/fs/ubifs/commit.c --- linux-2.6.24/fs/ubifs/commit.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/commit.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,679 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions that manage the running of the commit process. 
+ * Each affected module has its own functions to accomplish their part in the + * commit and those functions are called here. + * + * The commit is the process whereby all updates to the index and LEB properties + * are written out together and the journal becomes empty. This keeps the + * file system consistent - at all times the state can be recreated by reading + * the index and LEB properties and then replaying the journal. + * + * The commit is split into two parts named "commit start" and "commit end". + * During commit start, the commit process has exclusive access to the journal + * by holding the commit semaphore down for writing. As few I/O operations as + * possible are performed during commit start, instead the nodes that are to be + * written are merely identified. During commit end, the commit semaphore is no + * longer held and the journal is again in operation, allowing users to continue + * to use the file system while the bulk of the commit I/O is performed. The + * purpose of this two-step approach is to prevent the commit from causing any + * latency blips. Note that in any case, the commit does not prevent lookups + * (as permitted by the TNC mutex), or access to VFS data structures e.g. page + * cache. + */ + +#include <linux/freezer.h> +#include <linux/kthread.h> +#include "ubifs.h" + +/** + * do_commit - commit the journal. + * @c: UBIFS file-system description object + * + * This function implements UBIFS commit. The caller must hold @c->commit_sem + * for writing; this function releases it. Returns zero in case of success and + * a negative error code in case of failure. 
+ */ +static int do_commit(struct ubifs_info *c) +{ + int err, new_ltail_lnum, old_ltail_lnum, i; + struct ubifs_zbranch zroot; + struct ubifs_lp_stats lst; + + dbg_cmt("start"); + if (c->ro_media) { + err = -EROFS; + goto out_up; + } + + /* Sync all write buffers (necessary for recovery) */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + goto out_up; + } + + c->cmt_no += 1; + err = ubifs_gc_start_commit(c); + if (err) + goto out_up; + err = dbg_check_lprops(c); + if (err) + goto out_up; + err = ubifs_log_start_commit(c, &new_ltail_lnum); + if (err) + goto out_up; + err = ubifs_tnc_start_commit(c, &zroot); + if (err) + goto out_up; + err = ubifs_lpt_start_commit(c); + if (err) + goto out_up; + err = ubifs_orphan_start_commit(c); + if (err) + goto out_up; + + ubifs_get_lp_stats(c, &lst); + + up_write(&c->commit_sem); + + err = ubifs_tnc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_end_commit(c); + if (err) + goto out; + err = ubifs_orphan_end_commit(c); + if (err) + goto out; + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); + if (err) + goto out; + err = dbg_check_old_index(c, &zroot); + if (err) + goto out; + + mutex_lock(&c->mst_mutex); + c->mst_node->cmt_no = cpu_to_le64(c->cmt_no); + c->mst_node->log_lnum = cpu_to_le32(new_ltail_lnum); + c->mst_node->root_lnum = cpu_to_le32(zroot.lnum); + c->mst_node->root_offs = cpu_to_le32(zroot.offs); + c->mst_node->root_len = cpu_to_le32(zroot.len); + c->mst_node->ihead_lnum = cpu_to_le32(c->ihead_lnum); + c->mst_node->ihead_offs = cpu_to_le32(c->ihead_offs); + c->mst_node->index_size = cpu_to_le64(c->old_idx_sz); + c->mst_node->lpt_lnum = cpu_to_le32(c->lpt_lnum); + c->mst_node->lpt_offs = cpu_to_le32(c->lpt_offs); + c->mst_node->nhead_lnum = cpu_to_le32(c->nhead_lnum); + c->mst_node->nhead_offs = cpu_to_le32(c->nhead_offs); + c->mst_node->ltab_lnum = cpu_to_le32(c->ltab_lnum); + c->mst_node->ltab_offs = cpu_to_le32(c->ltab_offs); 
+ c->mst_node->lsave_lnum = cpu_to_le32(c->lsave_lnum); + c->mst_node->lsave_offs = cpu_to_le32(c->lsave_offs); + c->mst_node->lscan_lnum = cpu_to_le32(c->lscan_lnum); + c->mst_node->empty_lebs = cpu_to_le32(lst.empty_lebs); + c->mst_node->idx_lebs = cpu_to_le32(lst.idx_lebs); + c->mst_node->total_free = cpu_to_le64(lst.total_free); + c->mst_node->total_dirty = cpu_to_le64(lst.total_dirty); + c->mst_node->total_used = cpu_to_le64(lst.total_used); + c->mst_node->total_dead = cpu_to_le64(lst.total_dead); + c->mst_node->total_dark = cpu_to_le64(lst.total_dark); + if (c->no_orphs) + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + else + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); + err = ubifs_write_master(c); + mutex_unlock(&c->mst_mutex); + if (err) + goto out; + + err = ubifs_log_post_commit(c, old_ltail_lnum); + if (err) + goto out; + err = ubifs_gc_end_commit(c); + if (err) + goto out; + err = ubifs_lpt_post_commit(c); + if (err) + goto out; + + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_RESTING; + wake_up(&c->cmt_wq); + dbg_cmt("commit end"); + spin_unlock(&c->cs_lock); + + return 0; + +out_up: + up_write(&c->commit_sem); +out: + ubifs_err("commit failed, error %d", err); + spin_lock(&c->cs_lock); + c->cmt_state = COMMIT_BROKEN; + wake_up(&c->cmt_wq); + spin_unlock(&c->cs_lock); + ubifs_ro_mode(c, err); + return err; +} + +/** + * run_bg_commit - run background commit if it is needed. + * @c: UBIFS file-system description object + * + * This function runs background commit if it is needed. Returns zero in case + * of success and a negative error code in case of failure. + */ +static int run_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + /* + * Run background commit only if background commit was requested or if + * commit is required. 
+ */ + if (c->cmt_state != COMMIT_BACKGROUND && + c->cmt_state != COMMIT_REQUIRED) + goto out; + spin_unlock(&c->cs_lock); + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_REQUIRED) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + else if (c->cmt_state == COMMIT_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_BACKGROUND; + else + goto out_cmt_unlock; + spin_unlock(&c->cs_lock); + + return do_commit(c); + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return 0; +} + +/** + * ubifs_bg_thread - UBIFS background thread function. + * @info: points to the file-system description object + * + * This function implements various file-system background activities: + * o when a write-buffer timer expires it synchronizes the appropriate + * write-buffer; + * o when the journal is about to be full, it starts in-advance commit. + * + * Note, other stuff like background garbage collection may be added here in + * future. + */ +int ubifs_bg_thread(void *info) +{ + int err; + struct ubifs_info *c = info; + + dbg_msg("background thread \"%s\" started, PID %d", + c->bgt_name, current->pid); + set_freezable(); + + while (1) { + if (kthread_should_stop()) + break; + + if (try_to_freeze()) + continue; + + set_current_state(TASK_INTERRUPTIBLE); + /* Check if there is something to do */ + if (!c->need_bgt) { + /* + * Re-check the stop request before sleeping, otherwise + * we could go to sleep now, never be woken up, and + * block a task waiting in 'kthread_stop()' forever. + */ + if (kthread_should_stop()) + break; + schedule(); + continue; + } else + __set_current_state(TASK_RUNNING); + + c->need_bgt = 0; + err = ubifs_bg_wbufs_sync(c); + if (err) + ubifs_ro_mode(c, err); + + run_bg_commit(c); + cond_resched(); + } + + dbg_msg("background thread \"%s\" stops", c->bgt_name); + return 0; +} + +/** + * ubifs_commit_required - set commit state to "required". 
+ * @c: UBIFS file-system description object + * + * This function is called if a commit is required but cannot be done from the + * calling function, so it is just flagged instead. + */ +void ubifs_commit_required(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + switch (c->cmt_state) { + case COMMIT_RESTING: + case COMMIT_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_REQUIRED)); + c->cmt_state = COMMIT_REQUIRED; + break; + case COMMIT_RUNNING_BACKGROUND: + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_RUNNING_REQUIRED)); + c->cmt_state = COMMIT_RUNNING_REQUIRED; + break; + case COMMIT_REQUIRED: + case COMMIT_RUNNING_REQUIRED: + case COMMIT_BROKEN: + break; + } + spin_unlock(&c->cs_lock); +} + +/** + * ubifs_request_bg_commit - notify the background thread to do a commit. + * @c: UBIFS file-system description object + * + * This function is called if the journal is full enough to make a commit + * worthwhile, so background thread is kicked to start it. + */ +void ubifs_request_bg_commit(struct ubifs_info *c) +{ + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_RESTING) { + dbg_cmt("old: %s, new: %s", dbg_cstate(c->cmt_state), + dbg_cstate(COMMIT_BACKGROUND)); + c->cmt_state = COMMIT_BACKGROUND; + spin_unlock(&c->cs_lock); + ubifs_wake_up_bgt(c); + } else + spin_unlock(&c->cs_lock); +} + +/** + * wait_for_commit - wait for commit. + * @c: UBIFS file-system description object + * + * This function sleeps until the commit operation is no longer running. + */ +static int wait_for_commit(struct ubifs_info *c) +{ + dbg_cmt("pid %d goes sleep", current->pid); + + /* + * The following sleeps if the condition is false, and will be woken + * when the commit ends. It is possible, although very unlikely, that we + * will wake up and see the subsequent commit running, rather than the + * one we were waiting for, and go back to sleep. 
However, we will be + * woken again, so there is no danger of sleeping forever. + */ + wait_event(c->cmt_wq, c->cmt_state != COMMIT_RUNNING_BACKGROUND && + c->cmt_state != COMMIT_RUNNING_REQUIRED); + dbg_cmt("commit finished, pid %d woke up", current->pid); + return 0; +} + +/** + * ubifs_run_commit - run or wait for commit. + * @c: UBIFS file-system description object + * + * This function runs commit and returns zero in case of success and a negative + * error code in case of failure. + */ +int ubifs_run_commit(struct ubifs_info *c) +{ + int err = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + /* + * We set the commit state to 'running required' to indicate + * that we want it to complete as quickly as possible. + */ + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + spin_unlock(&c->cs_lock); + + /* Ok, the commit is indeed needed */ + + down_write(&c->commit_sem); + spin_lock(&c->cs_lock); + /* + * Since we unlocked 'c->cs_lock', the state may have changed, so + * re-check it. + */ + if (c->cmt_state == COMMIT_BROKEN) { + err = -EINVAL; + goto out_cmt_unlock; + } + + if (c->cmt_state == COMMIT_RUNNING_BACKGROUND) + c->cmt_state = COMMIT_RUNNING_REQUIRED; + + if (c->cmt_state == COMMIT_RUNNING_REQUIRED) { + up_write(&c->commit_sem); + spin_unlock(&c->cs_lock); + return wait_for_commit(c); + } + c->cmt_state = COMMIT_RUNNING_REQUIRED; + spin_unlock(&c->cs_lock); + + err = do_commit(c); + return err; + +out_cmt_unlock: + up_write(&c->commit_sem); +out: + spin_unlock(&c->cs_lock); + return err; +} + +/** + * ubifs_gc_should_commit - determine if it is time for GC to run commit. + * @c: UBIFS file-system description object + * + * This function is called by garbage collection to determine if commit should + * be run. 
If commit state is @COMMIT_BACKGROUND, which means that the journal + * is full enough to start commit, this function returns true. It is not + * absolutely necessary to commit yet, but it feels like this should be better + * than to keep doing GC. This function returns %1 if GC has to initiate commit + * and %0 if not. It also promotes @COMMIT_BACKGROUND to @COMMIT_REQUIRED. + */ +int ubifs_gc_should_commit(struct ubifs_info *c) +{ + int ret = 0; + + spin_lock(&c->cs_lock); + if (c->cmt_state == COMMIT_BACKGROUND) { + dbg_cmt("commit required now"); + c->cmt_state = COMMIT_REQUIRED; + } else + dbg_cmt("commit not requested"); + if (c->cmt_state == COMMIT_REQUIRED) + ret = 1; + spin_unlock(&c->cs_lock); + return ret; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * struct idx_node - hold index nodes during index tree traversal. + * @list: list + * @iip: index in parent (slot number of this indexing node in the parent + * indexing node) + * @upper_key: all keys in this indexing node have to be less or equivalent to + * this key + * @idx: index node (8-byte aligned because all node structures must be 8-byte + * aligned) + */ +struct idx_node { + struct list_head list; + int iip; + union ubifs_key upper_key; + struct ubifs_idx_node idx __attribute__((aligned(8))); +}; + +/** + * dbg_old_index_check_init - get information for the next old index check. + * @c: UBIFS file-system description object + * @zroot: root of the index + * + * This function records information about the index that will be needed for the + * next old index check i.e. 'dbg_check_old_index()'. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + struct ubifs_idx_node *idx; + int lnum, offs, len, err = 0; + struct ubifs_debug_info *d = c->dbg; + + d->old_zroot = *zroot; + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out; + + d->old_zroot_level = le16_to_cpu(idx->level); + d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum); +out: + kfree(idx); + return err; +} + +/** + * dbg_check_old_index - check the old copy of the index. + * @c: UBIFS file-system description object + * @zroot: root of the new index + * + * In order to be able to recover from an unclean unmount, a complete copy of + * the index must exist on flash. This is the "old" index. The commit process + * must write the "new" index to flash without overwriting or destroying any + * part of the old index. This function is run at commit end in order to check + * that the old index does indeed exist completely intact. + * + * This function returns %0 on success and a negative error code on failure. + */ +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt; + int first = 1, iip; + struct ubifs_debug_info *d = c->dbg; + union ubifs_key lower_key, upper_key, l_key, u_key; + unsigned long long uninitialized_var(last_sqnum); + struct ubifs_idx_node *idx; + struct list_head list; + struct idx_node *i; + size_t sz; + + if (!(ubifs_chk_flags & UBIFS_CHK_OLD_IDX)) + goto out; + + INIT_LIST_HEAD(&list); + + sz = sizeof(struct idx_node) + ubifs_idx_node_sz(c, c->fanout) - + UBIFS_IDX_NODE_SZ; + + /* Start at the old zroot */ + lnum = d->old_zroot.lnum; + offs = d->old_zroot.offs; + len = d->old_zroot.len; + iip = 0; + + /* + * Traverse the index tree preorder depth-first i.e. 
do a node and then + * its subtrees from left to right. + */ + while (1) { + struct ubifs_branch *br; + + /* Get the next index node */ + i = kmalloc(sz, GFP_NOFS); + if (!i) { + err = -ENOMEM; + goto out_free; + } + i->iip = iip; + /* Keep the index nodes on our path in a linked list */ + list_add_tail(&i->list, &list); + /* Read the index node */ + idx = &i->idx; + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err) + goto out_free; + /* Validate index node */ + child_cnt = le16_to_cpu(idx->child_cnt); + if (child_cnt < 1 || child_cnt > c->fanout) { + err = 1; + goto out_dump; + } + if (first) { + first = 0; + /* Check root level and sqnum */ + if (le16_to_cpu(idx->level) != d->old_zroot_level) { + err = 2; + goto out_dump; + } + if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) { + err = 3; + goto out_dump; + } + /* Set last values as though root had a parent */ + last_level = le16_to_cpu(idx->level) + 1; + last_sqnum = le64_to_cpu(idx->ch.sqnum) + 1; + key_read(c, ubifs_idx_key(c, idx), &lower_key); + highest_ino_key(c, &upper_key, INUM_WATERMARK); + } + key_copy(c, &upper_key, &i->upper_key); + if (le16_to_cpu(idx->level) != last_level - 1) { + err = 3; + goto out_dump; + } + /* + * The index is always written bottom up hence a child's sqnum + * is always less than the parents. 
+ */ + if (le64_to_cpu(idx->ch.sqnum) >= last_sqnum) { + err = 4; + goto out_dump; + } + /* Check key range */ + key_read(c, ubifs_idx_key(c, idx), &l_key); + br = ubifs_idx_branch(c, idx, child_cnt - 1); + key_read(c, &br->key, &u_key); + if (keys_cmp(c, &lower_key, &l_key) > 0) { + err = 5; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) < 0) { + err = 6; + goto out_dump; + } + if (keys_cmp(c, &upper_key, &u_key) == 0) + if (!is_hash_key(c, &u_key)) { + err = 7; + goto out_dump; + } + /* Go to next index node */ + if (le16_to_cpu(idx->level) == 0) { + /* At the bottom, so go up until can go right */ + while (1) { + /* Drop the bottom of the list */ + list_del(&i->list); + kfree(i); + /* No more list means we are done */ + if (list_empty(&list)) + goto out; + /* Look at the new bottom */ + i = list_entry(list.prev, struct idx_node, + list); + idx = &i->idx; + /* Can we go right */ + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + iip = iip + 1; + break; + } else + /* Nope, so go up again */ + iip = i->iip; + } + } else + /* Go down left */ + iip = 0; + /* + * We have the parent in 'idx' and now we set up for reading the + * child pointed to by slot 'iip'. 
+ */ + last_level = le16_to_cpu(idx->level); + last_sqnum = le64_to_cpu(idx->ch.sqnum); + br = ubifs_idx_branch(c, idx, iip); + lnum = le32_to_cpu(br->lnum); + offs = le32_to_cpu(br->offs); + len = le32_to_cpu(br->len); + key_read(c, &br->key, &lower_key); + if (iip + 1 < le16_to_cpu(idx->child_cnt)) { + br = ubifs_idx_branch(c, idx, iip + 1); + key_read(c, &br->key, &upper_key); + } else + key_copy(c, &i->upper_key, &upper_key); + } +out: + err = dbg_old_index_check_init(c, zroot); + if (err) + goto out_free; + + return 0; + +out_dump: + dbg_err("dumping index node (iip=%d)", i->iip); + dbg_dump_node(c, idx); + list_del(&i->list); + kfree(i); + if (!list_empty(&list)) { + i = list_entry(list.prev, struct idx_node, list); + dbg_err("dumping parent index node"); + dbg_dump_node(c, &i->idx); + } +out_free: + while (!list_empty(&list)) { + i = list_entry(list.next, struct idx_node, list); + list_del(&i->list); + kfree(i); + } + ubifs_err("failed, error %d", err); + if (err > 0) + err = -EINVAL; + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff -Nurd linux-2.6.24/fs/ubifs/compress.c ubifs-v2.6.24/fs/ubifs/compress.c --- linux-2.6.24/fs/ubifs/compress.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/compress.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,251 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + * Zoltan Sogor + */ + +/* + * This file provides a single place to access compression and + * decompression. + */ + +#include <linux/crypto.h> +#include "ubifs.h" + +/* Fake description object for the "none" compressor */ +static struct ubifs_compressor none_compr = { + .compr_type = UBIFS_COMPR_NONE, + .name = "none", + .capi_name = "", +}; + +#ifdef CONFIG_UBIFS_FS_LZO +static DEFINE_MUTEX(lzo_mutex); + +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .comp_mutex = &lzo_mutex, + .name = "lzo", + .capi_name = "lzo", +}; +#else +static struct ubifs_compressor lzo_compr = { + .compr_type = UBIFS_COMPR_LZO, + .name = "lzo", +}; +#endif + +#ifdef CONFIG_UBIFS_FS_ZLIB +static DEFINE_MUTEX(deflate_mutex); +static DEFINE_MUTEX(inflate_mutex); + +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .comp_mutex = &deflate_mutex, + .decomp_mutex = &inflate_mutex, + .name = "zlib", + .capi_name = "deflate", +}; +#else +static struct ubifs_compressor zlib_compr = { + .compr_type = UBIFS_COMPR_ZLIB, + .name = "zlib", +}; +#endif + +/* All UBIFS compressors */ +struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/** + * ubifs_compress - compress data. + * @in_buf: data to compress + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit + * + * This function compresses input buffer @in_buf of length @in_len and stores + * the result in the output buffer @out_buf and the resulting length in + * @out_len. 
If the input buffer does not compress, it is just copied to the + * @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE or if + * a compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. + */ +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type) +{ + int err; + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + if (compr->comp_mutex) + mutex_lock(compr->comp_mutex); + err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->comp_mutex) + mutex_unlock(compr->comp_mutex); + if (unlikely(err)) { + ubifs_warn("cannot compress %d bytes, compressor %s, " + "error %d, leave data uncompressed", + in_len, compr->name, err); + goto no_compr; + } + + /* + * If the data compressed only slightly, it is better to leave it + * uncompressed to improve read speed. + */ + if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) + goto no_compr; + + return; + +no_compr: + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + +/** + * ubifs_decompress - decompress data. + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should be stored + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This function + * returns %0 on success or a negative error code on failure. 
+ */ +int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, + int *out_len, int compr_type) +{ + int err; + struct ubifs_compressor *compr; + + if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { + ubifs_err("invalid compression type %d", compr_type); + return -EINVAL; + } + + compr = ubifs_compressors[compr_type]; + + if (unlikely(!compr->capi_name)) { + ubifs_err("%s compression is not compiled in", compr->name); + return -EINVAL; + } + + if (compr_type == UBIFS_COMPR_NONE) { + memcpy(out_buf, in_buf, in_len); + *out_len = in_len; + return 0; + } + + if (compr->decomp_mutex) + mutex_lock(compr->decomp_mutex); + err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, + (unsigned int *)out_len); + if (compr->decomp_mutex) + mutex_unlock(compr->decomp_mutex); + if (err) + ubifs_err("cannot decompress %d bytes, compressor %s, " + "error %d", in_len, compr->name, err); + + return err; +} + +/** + * compr_init - initialize a compressor. + * @compr: compressor description object + * + * This function initializes the requested compressor and returns zero in case + * of success or a negative error code in case of failure. + */ +static int __init compr_init(struct ubifs_compressor *compr) +{ + if (compr->capi_name) { + compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + if (IS_ERR(compr->cc)) { + ubifs_err("cannot initialize compressor %s, error %ld", + compr->name, PTR_ERR(compr->cc)); + return PTR_ERR(compr->cc); + } + } + + ubifs_compressors[compr->compr_type] = compr; + return 0; +} + +/** + * compr_exit - de-initialize a compressor. + * @compr: compressor description object + */ +static void compr_exit(struct ubifs_compressor *compr) +{ + if (compr->capi_name) + crypto_free_comp(compr->cc); + return; +} + +/** + * ubifs_compressors_init - initialize UBIFS compressors. + * + * This function initializes the compressor which were compiled in. Returns + * zero in case of success and a negative error code in case of failure. 
+ */ +int __init ubifs_compressors_init(void) +{ + int err; + + err = compr_init(&lzo_compr); + if (err) + return err; + + err = compr_init(&zlib_compr); + if (err) + goto out_lzo; + + ubifs_compressors[UBIFS_COMPR_NONE] = &none_compr; + return 0; + +out_lzo: + compr_exit(&lzo_compr); + return err; +} + +/** + * ubifs_compressors_exit - de-initialize UBIFS compressors. + */ +void ubifs_compressors_exit(void) +{ + compr_exit(&lzo_compr); + compr_exit(&zlib_compr); +} diff -Nurd linux-2.6.24/fs/ubifs/debug.c ubifs-v2.6.24/fs/ubifs/debug.c --- linux-2.6.24/fs/ubifs/debug.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/debug.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,2603 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements most of the debugging stuff which is compiled in only + * when it is enabled. But some debugging check functions are implemented in + * corresponding subsystem, just because they are closely related and utilize + * various local functions of those subsystems. 
+ */ + +#define UBIFS_DBG_PRESERVE_UBI + +#include "ubifs.h" +#include +#include +#include + +#ifdef CONFIG_UBIFS_FS_DEBUG + +DEFINE_SPINLOCK(dbg_lock); + +static char dbg_key_buf0[128]; +static char dbg_key_buf1[128]; + +unsigned int ubifs_msg_flags = UBIFS_MSG_FLAGS_DEFAULT; +unsigned int ubifs_chk_flags = UBIFS_CHK_FLAGS_DEFAULT; +unsigned int ubifs_tst_flags; + +module_param_named(debug_msgs, ubifs_msg_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_chks, ubifs_chk_flags, uint, S_IRUGO | S_IWUSR); +module_param_named(debug_tsts, ubifs_tst_flags, uint, S_IRUGO | S_IWUSR); + +MODULE_PARM_DESC(debug_msgs, "Debug message type flags"); +MODULE_PARM_DESC(debug_chks, "Debug check flags"); +MODULE_PARM_DESC(debug_tsts, "Debug special test flags"); + +static const char *get_key_fmt(int fmt) +{ + switch (fmt) { + case UBIFS_SIMPLE_KEY_FMT: + return "simple"; + default: + return "unknown/invalid format"; + } +} + +static const char *get_key_hash(int hash) +{ + switch (hash) { + case UBIFS_KEY_HASH_R5: + return "R5"; + case UBIFS_KEY_HASH_TEST: + return "test"; + default: + return "unknown/invalid name hash"; + } +} + +static const char *get_key_type(int type) +{ + switch (type) { + case UBIFS_INO_KEY: + return "inode"; + case UBIFS_DENT_KEY: + return "direntry"; + case UBIFS_XENT_KEY: + return "xentry"; + case UBIFS_DATA_KEY: + return "data"; + case UBIFS_TRUN_KEY: + return "truncate"; + default: + return "unknown/invalid key"; + } +} + +static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key, + char *buffer) +{ + char *p = buffer; + int type = key_type(c, key); + + if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) { + switch (type) { + case UBIFS_INO_KEY: + sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + sprintf(p, "(%lu, %s, %#08x)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_hash(c, key)); + break; + case UBIFS_DATA_KEY: + sprintf(p, "(%lu, 
%s, %u)", + (unsigned long)key_inum(c, key), + get_key_type(type), key_block(c, key)); + break; + case UBIFS_TRUN_KEY: + sprintf(p, "(%lu, %s)", + (unsigned long)key_inum(c, key), + get_key_type(type)); + break; + default: + sprintf(p, "(bad key type: %#08x, %#08x)", + key->u32[0], key->u32[1]); + } + } else + sprintf(p, "bad key format %d", c->key_fmt); +} + +const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf0); + return dbg_key_buf0; +} + +const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key) +{ + /* dbg_lock must be held */ + sprintf_key(c, key, dbg_key_buf1); + return dbg_key_buf1; +} + +const char *dbg_ntype(int type) +{ + switch (type) { + case UBIFS_PAD_NODE: + return "padding node"; + case UBIFS_SB_NODE: + return "superblock node"; + case UBIFS_MST_NODE: + return "master node"; + case UBIFS_REF_NODE: + return "reference node"; + case UBIFS_INO_NODE: + return "inode node"; + case UBIFS_DENT_NODE: + return "direntry node"; + case UBIFS_XENT_NODE: + return "xentry node"; + case UBIFS_DATA_NODE: + return "data node"; + case UBIFS_TRUN_NODE: + return "truncate node"; + case UBIFS_IDX_NODE: + return "indexing node"; + case UBIFS_CS_NODE: + return "commit start node"; + case UBIFS_ORPH_NODE: + return "orphan node"; + default: + return "unknown node"; + } +} + +static const char *dbg_gtype(int type) +{ + switch (type) { + case UBIFS_NO_NODE_GROUP: + return "no node group"; + case UBIFS_IN_NODE_GROUP: + return "in node group"; + case UBIFS_LAST_OF_NODE_GROUP: + return "last of node group"; + default: + return "unknown"; + } +} + +const char *dbg_cstate(int cmt_state) +{ + switch (cmt_state) { + case COMMIT_RESTING: + return "commit resting"; + case COMMIT_BACKGROUND: + return "background commit requested"; + case COMMIT_REQUIRED: + return "commit required"; + case COMMIT_RUNNING_BACKGROUND: + return "BACKGROUND commit running"; + case 
COMMIT_RUNNING_REQUIRED: + return "commit running and required"; + case COMMIT_BROKEN: + return "broken commit"; + default: + return "unknown commit state"; + } +} + +static void dump_ch(const struct ubifs_ch *ch) +{ + printk(KERN_DEBUG "\tmagic %#x\n", le32_to_cpu(ch->magic)); + printk(KERN_DEBUG "\tcrc %#x\n", le32_to_cpu(ch->crc)); + printk(KERN_DEBUG "\tnode_type %d (%s)\n", ch->node_type, + dbg_ntype(ch->node_type)); + printk(KERN_DEBUG "\tgroup_type %d (%s)\n", ch->group_type, + dbg_gtype(ch->group_type)); + printk(KERN_DEBUG "\tsqnum %llu\n", + (unsigned long long)le64_to_cpu(ch->sqnum)); + printk(KERN_DEBUG "\tlen %u\n", le32_to_cpu(ch->len)); +} +/* Print the VFS and UBIFS in-memory fields of @inode at KERN_DEBUG level */ +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode) +{ + const struct ubifs_inode *ui = ubifs_inode(inode); + + printk(KERN_DEBUG "Dump in-memory inode:\n"); + printk(KERN_DEBUG "\tinode %lu\n", inode->i_ino); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned long long)i_size_read(inode)); + printk(KERN_DEBUG "\tnlink %u\n", inode->i_nlink); + printk(KERN_DEBUG "\tuid %u\n", (unsigned int)inode->i_uid); + printk(KERN_DEBUG "\tgid %u\n", (unsigned int)inode->i_gid); + printk(KERN_DEBUG "\tatime %u.%u\n", + (unsigned int)inode->i_atime.tv_sec, + (unsigned int)inode->i_atime.tv_nsec); + printk(KERN_DEBUG "\tmtime %u.%u\n", + (unsigned int)inode->i_mtime.tv_sec, + (unsigned int)inode->i_mtime.tv_nsec); + printk(KERN_DEBUG "\tctime %u.%u\n", + (unsigned int)inode->i_ctime.tv_sec, + (unsigned int)inode->i_ctime.tv_nsec); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", ui->creat_sqnum); + printk(KERN_DEBUG "\txattr_size %u\n", ui->xattr_size); + printk(KERN_DEBUG "\txattr_cnt %u\n", ui->xattr_cnt); + printk(KERN_DEBUG "\txattr_names %u\n", ui->xattr_names); + printk(KERN_DEBUG "\tdirty %u\n", ui->dirty); + printk(KERN_DEBUG "\txattr %u\n", ui->xattr); + printk(KERN_DEBUG "\tbulk_read %u\n", ui->bulk_read); + printk(KERN_DEBUG "\tsynced_i_size %llu\n", + (unsigned long long)ui->synced_i_size); + 
printk(KERN_DEBUG "\tui_size %llu\n", + (unsigned long long)ui->ui_size); + printk(KERN_DEBUG "\tflags %d\n", ui->flags); + printk(KERN_DEBUG "\tcompr_type %d\n", ui->compr_type); + printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read); + printk(KERN_DEBUG "\tread_in_a_row %lu\n", ui->read_in_a_row); + printk(KERN_DEBUG "\tdata_len %d\n", ui->data_len); +} + +void dbg_dump_node(const struct ubifs_info *c, const void *node) +{ + int i, n; + union ubifs_key key; + const struct ubifs_ch *ch = node; + + if (dbg_failure_mode) + return; + + /* If the magic is incorrect, just hexdump the first bytes */ + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) { + printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, + (void *)node, UBIFS_CH_SZ, 1); + return; + } + + spin_lock(&dbg_lock); + dump_ch(node); + + switch (ch->node_type) { + case UBIFS_PAD_NODE: + { + const struct ubifs_pad_node *pad = node; + + printk(KERN_DEBUG "\tpad_len %u\n", + le32_to_cpu(pad->pad_len)); + break; + } + case UBIFS_SB_NODE: + { + const struct ubifs_sb_node *sup = node; + unsigned int sup_flags = le32_to_cpu(sup->flags); + + printk(KERN_DEBUG "\tkey_hash %d (%s)\n", + (int)sup->key_hash, get_key_hash(sup->key_hash)); + printk(KERN_DEBUG "\tkey_fmt %d (%s)\n", + (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); + printk(KERN_DEBUG "\tflags %#x\n", sup_flags); + printk(KERN_DEBUG "\t big_lpt %u\n", + !!(sup_flags & UBIFS_FLG_BIGLPT)); + printk(KERN_DEBUG "\tmin_io_size %u\n", + le32_to_cpu(sup->min_io_size)); + printk(KERN_DEBUG "\tleb_size %u\n", + le32_to_cpu(sup->leb_size)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(sup->leb_cnt)); + printk(KERN_DEBUG "\tmax_leb_cnt %u\n", + le32_to_cpu(sup->max_leb_cnt)); + printk(KERN_DEBUG "\tmax_bud_bytes %llu\n", + (unsigned long long)le64_to_cpu(sup->max_bud_bytes)); + printk(KERN_DEBUG "\tlog_lebs %u\n", + le32_to_cpu(sup->log_lebs)); + printk(KERN_DEBUG "\tlpt_lebs %u\n", 
+ le32_to_cpu(sup->lpt_lebs)); + printk(KERN_DEBUG "\torph_lebs %u\n", + le32_to_cpu(sup->orph_lebs)); + printk(KERN_DEBUG "\tjhead_cnt %u\n", + le32_to_cpu(sup->jhead_cnt)); + printk(KERN_DEBUG "\tfanout %u\n", + le32_to_cpu(sup->fanout)); + printk(KERN_DEBUG "\tlsave_cnt %u\n", + le32_to_cpu(sup->lsave_cnt)); + printk(KERN_DEBUG "\tdefault_compr %u\n", + (int)le16_to_cpu(sup->default_compr)); + printk(KERN_DEBUG "\trp_size %llu\n", + (unsigned long long)le64_to_cpu(sup->rp_size)); + printk(KERN_DEBUG "\trp_uid %u\n", + le32_to_cpu(sup->rp_uid)); + printk(KERN_DEBUG "\trp_gid %u\n", + le32_to_cpu(sup->rp_gid)); + printk(KERN_DEBUG "\tfmt_version %u\n", + le32_to_cpu(sup->fmt_version)); + printk(KERN_DEBUG "\ttime_gran %u\n", + le32_to_cpu(sup->time_gran)); + printk(KERN_DEBUG "\tUUID %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X\n", + sup->uuid[0], sup->uuid[1], sup->uuid[2], sup->uuid[3], + sup->uuid[4], sup->uuid[5], sup->uuid[6], sup->uuid[7], + sup->uuid[8], sup->uuid[9], sup->uuid[10], sup->uuid[11], + sup->uuid[12], sup->uuid[13], sup->uuid[14], + sup->uuid[15]); + break; + } + case UBIFS_MST_NODE: + { + const struct ubifs_mst_node *mst = node; + + printk(KERN_DEBUG "\thighest_inum %llu\n", + (unsigned long long)le64_to_cpu(mst->highest_inum)); + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long)le64_to_cpu(mst->cmt_no)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(mst->flags)); + printk(KERN_DEBUG "\tlog_lnum %u\n", + le32_to_cpu(mst->log_lnum)); + printk(KERN_DEBUG "\troot_lnum %u\n", + le32_to_cpu(mst->root_lnum)); + printk(KERN_DEBUG "\troot_offs %u\n", + le32_to_cpu(mst->root_offs)); + printk(KERN_DEBUG "\troot_len %u\n", + le32_to_cpu(mst->root_len)); + printk(KERN_DEBUG "\tgc_lnum %u\n", + le32_to_cpu(mst->gc_lnum)); + printk(KERN_DEBUG "\tihead_lnum %u\n", + le32_to_cpu(mst->ihead_lnum)); + printk(KERN_DEBUG "\tihead_offs %u\n", + le32_to_cpu(mst->ihead_offs)); + printk(KERN_DEBUG "\tindex_size 
%llu\n", + (unsigned long long)le64_to_cpu(mst->index_size)); + printk(KERN_DEBUG "\tlpt_lnum %u\n", + le32_to_cpu(mst->lpt_lnum)); + printk(KERN_DEBUG "\tlpt_offs %u\n", + le32_to_cpu(mst->lpt_offs)); + printk(KERN_DEBUG "\tnhead_lnum %u\n", + le32_to_cpu(mst->nhead_lnum)); + printk(KERN_DEBUG "\tnhead_offs %u\n", + le32_to_cpu(mst->nhead_offs)); + printk(KERN_DEBUG "\tltab_lnum %u\n", + le32_to_cpu(mst->ltab_lnum)); + printk(KERN_DEBUG "\tltab_offs %u\n", + le32_to_cpu(mst->ltab_offs)); + printk(KERN_DEBUG "\tlsave_lnum %u\n", + le32_to_cpu(mst->lsave_lnum)); + printk(KERN_DEBUG "\tlsave_offs %u\n", + le32_to_cpu(mst->lsave_offs)); + printk(KERN_DEBUG "\tlscan_lnum %u\n", + le32_to_cpu(mst->lscan_lnum)); + printk(KERN_DEBUG "\tleb_cnt %u\n", + le32_to_cpu(mst->leb_cnt)); + printk(KERN_DEBUG "\tempty_lebs %u\n", + le32_to_cpu(mst->empty_lebs)); + printk(KERN_DEBUG "\tidx_lebs %u\n", + le32_to_cpu(mst->idx_lebs)); + printk(KERN_DEBUG "\ttotal_free %llu\n", + (unsigned long long)le64_to_cpu(mst->total_free)); + printk(KERN_DEBUG "\ttotal_dirty %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dirty)); + printk(KERN_DEBUG "\ttotal_used %llu\n", + (unsigned long long)le64_to_cpu(mst->total_used)); + printk(KERN_DEBUG "\ttotal_dead %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dead)); + printk(KERN_DEBUG "\ttotal_dark %llu\n", + (unsigned long long)le64_to_cpu(mst->total_dark)); + break; + } + case UBIFS_REF_NODE: + { + const struct ubifs_ref_node *ref = node; + + printk(KERN_DEBUG "\tlnum %u\n", + le32_to_cpu(ref->lnum)); + printk(KERN_DEBUG "\toffs %u\n", + le32_to_cpu(ref->offs)); + printk(KERN_DEBUG "\tjhead %u\n", + le32_to_cpu(ref->jhead)); + break; + } + case UBIFS_INO_NODE: + { + const struct ubifs_ino_node *ino = node; + + key_read(c, &ino->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tcreat_sqnum %llu\n", + (unsigned long long)le64_to_cpu(ino->creat_sqnum)); + printk(KERN_DEBUG "\tsize %llu\n", + (unsigned 
long long)le64_to_cpu(ino->size)); + printk(KERN_DEBUG "\tnlink %u\n", + le32_to_cpu(ino->nlink)); + printk(KERN_DEBUG "\tatime %lld.%u\n", + (long long)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + printk(KERN_DEBUG "\tmtime %lld.%u\n", + (long long)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); + printk(KERN_DEBUG "\tctime %lld.%u\n", + (long long)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); + printk(KERN_DEBUG "\tuid %u\n", + le32_to_cpu(ino->uid)); + printk(KERN_DEBUG "\tgid %u\n", + le32_to_cpu(ino->gid)); + printk(KERN_DEBUG "\tmode %u\n", + le32_to_cpu(ino->mode)); + printk(KERN_DEBUG "\tflags %#x\n", + le32_to_cpu(ino->flags)); + printk(KERN_DEBUG "\txattr_cnt %u\n", + le32_to_cpu(ino->xattr_cnt)); + printk(KERN_DEBUG "\txattr_size %u\n", + le32_to_cpu(ino->xattr_size)); + printk(KERN_DEBUG "\txattr_names %u\n", + le32_to_cpu(ino->xattr_names)); + printk(KERN_DEBUG "\tcompr_type %#x\n", + (int)le16_to_cpu(ino->compr_type)); + printk(KERN_DEBUG "\tdata len %u\n", + le32_to_cpu(ino->data_len)); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + const struct ubifs_dent_node *dent = node; + int nlen = le16_to_cpu(dent->nlen); + + key_read(c, &dent->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG "\tinum %llu\n", + (unsigned long long)le64_to_cpu(dent->inum)); + printk(KERN_DEBUG "\ttype %d\n", (int)dent->type); + printk(KERN_DEBUG "\tnlen %d\n", nlen); + printk(KERN_DEBUG "\tname "); + + if (nlen > UBIFS_MAX_NLEN) + printk(KERN_DEBUG "(bad name length, not printing, " + "bad or corrupted node)"); + else { + for (i = 0; i < nlen && dent->name[i]; i++) + printk(KERN_CONT "%c", dent->name[i]); + } + printk(KERN_CONT "\n"); + + break; + } + case UBIFS_DATA_NODE: + { + const struct ubifs_data_node *dn = node; + int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ; + + key_read(c, &dn->key, &key); + printk(KERN_DEBUG "\tkey %s\n", DBGKEY(&key)); + printk(KERN_DEBUG 
"\tsize %u\n", + le32_to_cpu(dn->size)); + printk(KERN_DEBUG "\tcompr_typ %d\n", + (int)le16_to_cpu(dn->compr_type)); + printk(KERN_DEBUG "\tdata size %d\n", + dlen); + printk(KERN_DEBUG "\tdata:\n"); + print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1, + (void *)&dn->data, dlen, 0); + break; + } + case UBIFS_TRUN_NODE: + { + const struct ubifs_trun_node *trun = node; + + printk(KERN_DEBUG "\tinum %u\n", + le32_to_cpu(trun->inum)); + printk(KERN_DEBUG "\told_size %llu\n", + (unsigned long long)le64_to_cpu(trun->old_size)); + printk(KERN_DEBUG "\tnew_size %llu\n", + (unsigned long long)le64_to_cpu(trun->new_size)); + break; + } + case UBIFS_IDX_NODE: + { + const struct ubifs_idx_node *idx = node; + + n = le16_to_cpu(idx->child_cnt); + printk(KERN_DEBUG "\tchild_cnt %d\n", n); + printk(KERN_DEBUG "\tlevel %d\n", + (int)le16_to_cpu(idx->level)); + printk(KERN_DEBUG "\tBranches:\n"); + + for (i = 0; i < n && i < c->fanout - 1; i++) { + const struct ubifs_branch *br; + + br = ubifs_idx_branch(c, idx, i); + key_read(c, &br->key, &key); + printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n", + i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs), + le32_to_cpu(br->len), DBGKEY(&key)); + } + break; + } + case UBIFS_CS_NODE: + break; + case UBIFS_ORPH_NODE: + { + const struct ubifs_orph_node *orph = node; + + printk(KERN_DEBUG "\tcommit number %llu\n", + (unsigned long long) + le64_to_cpu(orph->cmt_no) & LLONG_MAX); + printk(KERN_DEBUG "\tlast node flag %llu\n", + (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63); + n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3; + printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); + for (i = 0; i < n; i++) + printk(KERN_DEBUG "\t ino %llu\n", + (unsigned long long)le64_to_cpu(orph->inos[i])); + break; + } + default: + printk(KERN_DEBUG "node type %d was not recognized\n", + (int)ch->node_type); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_budget_req(const struct ubifs_budget_req *req) +{ + spin_lock(&dbg_lock); + 
printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n", + req->new_ino, req->dirtied_ino); + printk(KERN_DEBUG "\tnew_ino_d %d, dirtied_ino_d %d\n", + req->new_ino_d, req->dirtied_ino_d); + printk(KERN_DEBUG "\tnew_page %d, dirtied_page %d\n", + req->new_page, req->dirtied_page); + printk(KERN_DEBUG "\tnew_dent %d, mod_dent %d\n", + req->new_dent, req->mod_dent); + printk(KERN_DEBUG "\tidx_growth %d\n", req->idx_growth); + printk(KERN_DEBUG "\tdata_growth %d dd_growth %d\n", + req->data_growth, req->dd_growth); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lstats(const struct ubifs_lp_stats *lst) +{ + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, " + "idx_lebs %d\n", current->pid, lst->empty_lebs, lst->idx_lebs); + printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, " + "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free, + lst->total_dirty); + printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, " + "total_dead %lld\n", lst->total_used, lst->total_dark, + lst->total_dead); + spin_unlock(&dbg_lock); +} + +void dbg_dump_budg(struct ubifs_info *c) +{ + int i; + struct rb_node *rb; + struct ubifs_bud *bud; + struct ubifs_gced_idx_leb *idx_gc; + long long available, outstanding, free; + + ubifs_assert(spin_is_locked(&c->space_lock)); + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, " + "budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid, + c->budg_data_growth, c->budg_dd_growth, c->budg_idx_growth); + printk(KERN_DEBUG "\tdata budget sum %lld, total budget sum %lld, " + "freeable_cnt %d\n", c->budg_data_growth + c->budg_dd_growth, + c->budg_data_growth + c->budg_dd_growth + c->budg_idx_growth, + c->freeable_cnt); + printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %lld, " + "calc_idx_sz %lld, idx_gc_cnt %d\n", c->min_idx_lebs, + c->old_idx_sz, c->calc_idx_sz, c->idx_gc_cnt); + printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, " + 
"clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt), + atomic_long_read(&c->dirty_zn_cnt), + atomic_long_read(&c->clean_zn_cnt)); + printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n", + c->dark_wm, c->dead_wm, c->max_idx_node_sz); + printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n", + c->gc_lnum, c->ihead_lnum); + /* If we are in R/O mode, journal heads do not exist */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) + printk(KERN_DEBUG "\tjhead %d\t LEB %d\n", + c->jheads[i].wbuf.jhead, c->jheads[i].wbuf.lnum); + for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) { + bud = rb_entry(rb, struct ubifs_bud, rb); + printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum); + } + list_for_each_entry(bud, &c->old_buds, list) + printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum); + list_for_each_entry(idx_gc, &c->idx_gc, list) + printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n", + idx_gc->lnum, idx_gc->unmap); + printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state); + + /* Print budgeting predictions */ + available = ubifs_calc_available(c, c->min_idx_lebs); + outstanding = c->budg_data_growth + c->budg_dd_growth; + free = ubifs_get_free_space_nolock(c); + printk(KERN_DEBUG "Budgeting predictions:\n"); + printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n", + available, outstanding, free); + spin_unlock(&dbg_lock); +} + +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp) +{ + printk(KERN_DEBUG "LEB %d lprops: free %d, dirty %d (used %d), " + "flags %#x\n", lp->lnum, lp->free, lp->dirty, + c->leb_size - lp->free - lp->dirty, lp->flags); +} + +void dbg_dump_lprops(struct ubifs_info *c) +{ + int lnum, err; + struct ubifs_lprops lp; + struct ubifs_lp_stats lst; + + printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n", + current->pid); + ubifs_get_lp_stats(c, &lst); + dbg_dump_lstats(&lst); + + for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { + err = ubifs_read_one_lp(c, lnum, &lp); + if (err) + 
ubifs_err("cannot read lprops for LEB %d", lnum); + + dbg_dump_lprop(c, &lp); + } + printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n", + current->pid); +} + +void dbg_dump_lpt_info(struct ubifs_info *c) +{ + int i; + + spin_lock(&dbg_lock); + printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid); + printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz); + printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz); + printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz); + printk(KERN_DEBUG "\tltab_sz: %d\n", c->ltab_sz); + printk(KERN_DEBUG "\tlsave_sz: %d\n", c->lsave_sz); + printk(KERN_DEBUG "\tbig_lpt: %d\n", c->big_lpt); + printk(KERN_DEBUG "\tlpt_hght: %d\n", c->lpt_hght); + printk(KERN_DEBUG "\tpnode_cnt: %d\n", c->pnode_cnt); + printk(KERN_DEBUG "\tnnode_cnt: %d\n", c->nnode_cnt); + printk(KERN_DEBUG "\tdirty_pn_cnt: %d\n", c->dirty_pn_cnt); + printk(KERN_DEBUG "\tdirty_nn_cnt: %d\n", c->dirty_nn_cnt); + printk(KERN_DEBUG "\tlsave_cnt: %d\n", c->lsave_cnt); + printk(KERN_DEBUG "\tspace_bits: %d\n", c->space_bits); + printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits); + printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits); + printk(KERN_DEBUG "\tlpt_spc_bits: %d\n", c->lpt_spc_bits); + printk(KERN_DEBUG "\tpcnt_bits: %d\n", c->pcnt_bits); + printk(KERN_DEBUG "\tlnum_bits: %d\n", c->lnum_bits); + printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs); + printk(KERN_DEBUG "\tLPT head is at %d:%d\n", + c->nhead_lnum, c->nhead_offs); + printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", + c->ltab_lnum, c->ltab_offs); + if (c->big_lpt) + printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n", + c->lsave_lnum, c->lsave_offs); + for (i = 0; i < c->lpt_lebs; i++) + printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d " + "cmt %d\n", i + c->lpt_first, c->ltab[i].free, + c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt); + spin_unlock(&dbg_lock); +} + +void dbg_dump_leb(const struct ubifs_info *c, int lnum) +{ + struct ubifs_scan_leb 
*sleb; + struct ubifs_scan_node *snod; + + if (dbg_failure_mode) + return; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); + if (IS_ERR(sleb)) { + ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + return; + } + + printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum, + sleb->nodes_cnt, sleb->endpt); + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum, + snod->offs, snod->len); + dbg_dump_node(c, snod->node); + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); + ubifs_scan_destroy(sleb); + return; +} + +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode) +{ + int n; + const struct ubifs_zbranch *zbr; + + spin_lock(&dbg_lock); + if (znode->parent) + zbr = &znode->parent->zbranch[znode->iip]; + else + zbr = &c->zroot; + + printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d" + " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs, + zbr->len, znode->parent, znode->iip, znode->level, + znode->child_cnt, znode->flags); + + if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) { + spin_unlock(&dbg_lock); + return; + } + + printk(KERN_DEBUG "zbranches:\n"); + for (n = 0; n < znode->child_cnt; n++) { + zbr = &znode->zbranch[n]; + if (znode->level > 0) + printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + else + printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key " + "%s\n", n, zbr->znode, zbr->lnum, + zbr->offs, zbr->len, + DBGKEY(&zbr->key)); + } + spin_unlock(&dbg_lock); +} + +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n", + current->pid, cat, heap->cnt); + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops 
*lprops = heap->arr[i]; + + printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d " + "flags %d\n", i, lprops->lnum, lprops->hpos, + lprops->free, lprops->dirty, lprops->flags); + } + printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid); +} + +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip) +{ + int i; + + printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid); + printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n", + (size_t)pnode, (size_t)parent, (size_t)pnode->cnext); + printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n", + pnode->flags, iip, pnode->level, pnode->num); + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + struct ubifs_lprops *lp = &pnode->lprops[i]; + + printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n", + i, lp->free, lp->dirty, lp->flags, lp->lnum); + } +} + +void dbg_dump_tnc(struct ubifs_info *c) +{ + struct ubifs_znode *znode; + int level; + + printk(KERN_DEBUG "\n"); + printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + while (znode) { + if (level != znode->level) { + level = znode->level; + printk(KERN_DEBUG "== Level %d ==\n", level); + } + dbg_dump_znode(c, znode); + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + } + printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid); +} + +static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode, + void *priv) +{ + dbg_dump_znode(c, znode); + return 0; +} + +/** + * dbg_dump_index - dump the on-flash index. + * @c: UBIFS file-system description object + * + * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()' + * which dumps only in-memory znodes and does not read znodes which from flash. 
+ */ +void dbg_dump_index(struct ubifs_info *c) +{ + dbg_walk_index(c, NULL, dump_znode, NULL); +} + +/** + * dbg_save_space_info - save information about flash space. + * @c: UBIFS file-system description object + * + * This function saves information about UBIFS free space, dirty space, etc, in + * order to check it later. + */ +void dbg_save_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + + ubifs_get_lp_stats(c, &d->saved_lst); + + spin_lock(&c->space_lock); + d->saved_free = ubifs_get_free_space_nolock(c); + spin_unlock(&c->space_lock); +} + +/** + * dbg_check_space_info - check flash space information. + * @c: UBIFS file-system description object + * + * This function compares current flash space information with the information + * which was saved when the 'dbg_save_space_info()' function was called. + * Returns zero if the information has not changed, and %-EINVAL it it has + * changed. + */ +int dbg_check_space_info(struct ubifs_info *c) +{ + struct ubifs_debug_info *d = c->dbg; + struct ubifs_lp_stats lst; + long long avail, free; + + spin_lock(&c->space_lock); + avail = ubifs_calc_available(c, c->min_idx_lebs); + spin_unlock(&c->space_lock); + free = ubifs_get_free_space(c); + + if (free != d->saved_free) { + ubifs_err("free space changed from %lld to %lld", + d->saved_free, free); + goto out; + } + + return 0; + +out: + ubifs_msg("saved lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + ubifs_get_lp_stats(c, &lst); + ubifs_msg("current lprops statistics dump"); + dbg_dump_lstats(&d->saved_lst); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dump_stack(); + return -EINVAL; +} + +/** + * dbg_check_synced_i_size - check synchronized inode size. + * @inode: inode to check + * + * If inode is clean, synchronized inode size has to be equivalent to current + * inode size. This function has to be called only for locked inodes (@i_mutex + * has to be locked). 
Returns %0 if synchronized inode size is correct, and
+ * %-EINVAL if not.
+ */
+int dbg_check_synced_i_size(struct inode *inode)
+{
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	int ret = 0;
+
+	/* Only regular files are checked, and only if general checks are on */
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN) || !S_ISREG(inode->i_mode))
+		return 0;
+
+	mutex_lock(&ui->ui_mutex);
+	spin_lock(&ui->ui_lock);
+	/* A clean inode must have identical current and synchronized sizes */
+	if (!ui->dirty && ui->ui_size != ui->synced_i_size) {
+		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
+			  "is clean", ui->ui_size, ui->synced_i_size);
+		ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
+			  inode->i_mode, i_size_read(inode));
+		dbg_dump_stack();
+		ret = -EINVAL;
+	}
+	spin_unlock(&ui->ui_lock);
+	mutex_unlock(&ui->ui_mutex);
+
+	return ret;
+}
+
+/**
+ * dbg_check_dir_size - check directory inode size and link count.
+ * @c: UBIFS file-system description object
+ * @dir: the directory to check
+ *
+ * This function makes sure that directory size and link count are correct.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Note, it is a good idea to make sure the @dir->i_mutex is locked before
+ * calling this function.
+ */ +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir) +{ + unsigned int nlink = 2; + union ubifs_key key; + struct ubifs_dent_node *dent, *pdent = NULL; + struct qstr nm = { .name = NULL }; + loff_t size = UBIFS_INO_NODE_SZ; + + if (!(ubifs_chk_flags & UBIFS_CHK_GEN)) + return 0; + + if (!S_ISDIR(dir->i_mode)) + return 0; + + lowest_dent_key(c, &key, dir->i_ino); + while (1) { + int err; + + dent = ubifs_tnc_next_ent(c, &key, &nm); + if (IS_ERR(dent)) { + err = PTR_ERR(dent); + if (err == -ENOENT) + break; + return err; + } + + nm.name = dent->name; + nm.len = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(nm.len); + if (dent->type == UBIFS_ITYPE_DIR) + nlink += 1; + kfree(pdent); + pdent = dent; + key_read(c, &dent->key, &key); + } + kfree(pdent); + + if (i_size_read(dir) != size) { + ubifs_err("directory inode %lu has size %llu, " + "but calculated size is %llu", dir->i_ino, + (unsigned long long)i_size_read(dir), + (unsigned long long)size); + dump_stack(); + return -EINVAL; + } + if (dir->i_nlink != nlink) { + ubifs_err("directory inode %lu has nlink %u, but calculated " + "nlink is %u", dir->i_ino, dir->i_nlink, nlink); + dump_stack(); + return -EINVAL; + } + + return 0; +} + +/** + * dbg_check_key_order - make sure that colliding keys are properly ordered. + * @c: UBIFS file-system description object + * @zbr1: first zbranch + * @zbr2: following zbranch + * + * In UBIFS indexing B-tree colliding keys has to be sorted in binary order of + * names of the direntries/xentries which are referred by the keys. This + * function reads direntries/xentries referred by @zbr1 and @zbr2 and makes + * sure the name of direntry/xentry referred by @zbr1 is less than + * direntry/xentry referred by @zbr2. Returns zero if this is true, %1 if not, + * and a negative error code in case of failure. 
+ *
+ * Both nodes are read into temporary buffers which are freed before
+ * returning. %1 is also returned when the key stored in a node does not match
+ * the key of the zbranch which refers to it.
+ */
+static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
+			       struct ubifs_zbranch *zbr2)
+{
+	int err, nlen1, nlen2, cmp;
+	struct ubifs_dent_node *dent1, *dent2;
+	union ubifs_key key;
+
+	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
+	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent1)
+		return -ENOMEM;
+	dent2 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent2) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	err = ubifs_tnc_read_node(c, zbr1, dent1);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent1);
+	if (err)
+		goto out_free;
+
+	err = ubifs_tnc_read_node(c, zbr2, dent2);
+	if (err)
+		goto out_free;
+	err = ubifs_validate_entry(c, dent2);
+	if (err)
+		goto out_free;
+
+	/* Make sure node keys are the same as in zbranch */
+	err = 1;
+	key_read(c, &dent1->key, &key);
+	if (keys_cmp(c, &zbr1->key, &key)) {
+		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			zbr1->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr1->key));
+		dbg_dump_node(c, dent1);
+		goto out_free;
+	}
+
+	key_read(c, &dent2->key, &key);
+	if (keys_cmp(c, &zbr2->key, &key)) {
+		/* Report the position of the second entry, not the first one */
+		dbg_err("2nd entry at %d:%d has key %s", zbr2->lnum,
+			zbr2->offs, DBGKEY(&key));
+		dbg_err("but it should have key %s according to tnc",
+			DBGKEY(&zbr2->key));
+		dbg_dump_node(c, dent2);
+		goto out_free;
+	}
+
+	nlen1 = le16_to_cpu(dent1->nlen);
+	nlen2 = le16_to_cpu(dent2->nlen);
+
+	/* Compare the common prefix; on a tie the shorter name goes first */
+	cmp = memcmp(dent1->name, dent2->name, min_t(int, nlen1, nlen2));
+	if (cmp < 0 || (cmp == 0 && nlen1 < nlen2)) {
+		err = 0;
+		goto out_free;
+	}
+	if (cmp == 0 && nlen1 == nlen2)
+		dbg_err("2 xent/dent nodes with the same name");
+	else
+		dbg_err("bad order of colliding key %s",
+			DBGKEY(&key));
+
+	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+	dbg_dump_node(c, dent1);
+	ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+	dbg_dump_node(c, dent2);
+
+out_free:
+	kfree(dent2);
+	kfree(dent1);
+	return err;
+}
+
+/**
+ * dbg_check_znode - check if znode is all right.
+ * @c: UBIFS file-system description object
+ * @zbr: zbranch which points to this znode
+ *
+ * This function makes sure that znode referred to by @zbr is all right.
+ * Returns zero if it is, and %-EINVAL if it is not. A negative error code
+ * returned by 'dbg_check_key_order()' is passed through unchanged.
+ *
+ * While checking, 'err' holds the number (1-20) of the check which failed; it
+ * is only printed in the failure message and never returned to the caller.
+ */
+static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
+{
+	struct ubifs_znode *znode = zbr->znode;
+	struct ubifs_znode *zp = znode->parent;
+	int n, err, cmp;
+
+	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
+		err = 1;
+		goto out;
+	}
+	if (znode->level < 0) {
+		err = 2;
+		goto out;
+	}
+	if (znode->iip < 0 || znode->iip >= c->fanout) {
+		err = 3;
+		goto out;
+	}
+
+	if (zbr->len == 0)
+		/* Only dirty zbranch may have no on-flash nodes */
+		if (!ubifs_zn_dirty(znode)) {
+			err = 4;
+			goto out;
+		}
+
+	if (ubifs_zn_dirty(znode)) {
+		/*
+		 * If znode is dirty, its parent has to be dirty as well. The
+		 * order of the operation is important, so we have to have
+		 * memory barriers.
+		 */
+		smp_mb();
+		if (zp && !ubifs_zn_dirty(zp)) {
+			/*
+			 * The dirty flag is atomic and is cleared outside the
+			 * TNC mutex, so znode's dirty flag may now have
+			 * been cleared. The child is always cleared before the
+			 * parent, so we just need to check again.
+			 */
+			smp_mb();
+			if (ubifs_zn_dirty(znode)) {
+				err = 5;
+				goto out;
+			}
+		}
+	}
+
+	if (zp) {
+		const union ubifs_key *min, *max;
+
+		if (znode->level != zp->level - 1) {
+			err = 6;
+			goto out;
+		}
+
+		/* Make sure the 'parent' pointer in our znode is correct */
+		err = ubifs_search_zbranch(c, zp, &zbr->key, &n);
+		if (!err) {
+			/* This zbranch does not exist in the parent */
+			err = 7;
+			goto out;
+		}
+
+		if (znode->iip >= zp->child_cnt) {
+			err = 8;
+			goto out;
+		}
+
+		if (znode->iip != n) {
+			/* This may happen only in case of collisions */
+			if (keys_cmp(c, &zp->zbranch[n].key,
+				     &zp->zbranch[znode->iip].key)) {
+				err = 9;
+				goto out;
+			}
+			n = znode->iip;
+		}
+
+		/*
+		 * Make sure that the first key in our znode is greater than or
+		 * equal to the key in the pointing zbranch.
+		 */
+		min = &zbr->key;
+		cmp = keys_cmp(c, min, &znode->zbranch[0].key);
+		if (cmp == 1) {
+			err = 10;
+			goto out;
+		}
+
+		if (n + 1 < zp->child_cnt) {
+			max = &zp->zbranch[n + 1].key;
+
+			/*
+			 * Make sure the last key in our znode is less than or
+			 * equal to the key in the zbranch which goes
+			 * after our pointing zbranch.
+			 */
+			cmp = keys_cmp(c, max,
+				&znode->zbranch[znode->child_cnt - 1].key);
+			if (cmp == -1) {
+				err = 11;
+				goto out;
+			}
+		}
+	} else {
+		/* This may only be root znode */
+		if (zbr != &c->zroot) {
+			err = 12;
+			goto out;
+		}
+	}
+
+	/*
+	 * Make sure that next key is greater than or equal to the previous
+	 * one.
+	 */
+	for (n = 1; n < znode->child_cnt; n++) {
+		cmp = keys_cmp(c, &znode->zbranch[n - 1].key,
+			       &znode->zbranch[n].key);
+		if (cmp > 0) {
+			err = 13;
+			goto out;
+		}
+		if (cmp == 0) {
+			/* This can only be keys with colliding hash */
+			if (!is_hash_key(c, &znode->zbranch[n].key)) {
+				err = 14;
+				goto out;
+			}
+
+			/* Names are only compared for leaves and not during replay */
+			if (znode->level != 0 || c->replaying)
+				continue;
+
+			/*
+			 * Colliding keys should follow binary order of
+			 * corresponding xentry/dentry names.
+			 */
+			err = dbg_check_key_order(c, &znode->zbranch[n - 1],
+						  &znode->zbranch[n]);
+			if (err < 0)
+				return err;
+			if (err) {
+				err = 15;
+				goto out;
+			}
+		}
+	}
+
+	/* Sanity-check the position fields of every zbranch */
+	for (n = 0; n < znode->child_cnt; n++) {
+		if (!znode->zbranch[n].znode &&
+		    (znode->zbranch[n].lnum == 0 ||
+		     znode->zbranch[n].len == 0)) {
+			err = 16;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum != 0 &&
+		    znode->zbranch[n].len == 0) {
+			err = 17;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum == 0 &&
+		    znode->zbranch[n].len != 0) {
+			err = 18;
+			goto out;
+		}
+
+		if (znode->zbranch[n].lnum == 0 &&
+		    znode->zbranch[n].offs != 0) {
+			err = 19;
+			goto out;
+		}
+
+		if (znode->level != 0 && znode->zbranch[n].znode)
+			if (znode->zbranch[n].znode->parent != znode) {
+				err = 20;
+				goto out;
+			}
+	}
+
+	return 0;
+
+out:
+	ubifs_err("failed, error %d", err);
+	ubifs_msg("dump of the znode");
+	dbg_dump_znode(c, znode);
+	if (zp) {
+		ubifs_msg("dump of the parent znode");
+		dbg_dump_znode(c, zp);
+	}
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * dbg_check_tnc - check TNC tree.
+ * @c: UBIFS file-system description object
+ * @extra: do extra checks that are possible at start commit
+ *
+ * This function traverses whole TNC tree and checks every znode. Returns zero
+ * if everything is all right and %-EINVAL if something is wrong with TNC.
+ */
+int dbg_check_tnc(struct ubifs_info *c, int extra)
+{
+	struct ubifs_znode *znode, *prev;
+	struct ubifs_zbranch *zbr, *tail, *head;
+	long nr_clean = 0, nr_dirty = 0;
+	int err;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_TNC))
+		return 0;
+
+	ubifs_assert(mutex_is_locked(&c->tnc_mutex));
+	if (!c->zroot.znode)
+		return 0;
+
+	znode = ubifs_tnc_postorder_first(c->zroot.znode);
+	for (;;) {
+		/* The root is referred to by 'c->zroot', others by their parent */
+		zbr = znode->parent ? &znode->parent->zbranch[znode->iip] :
+				      &c->zroot;
+
+		err = dbg_check_znode(c, zbr);
+		if (err)
+			return err;
+
+		if (extra) {
+			if (ubifs_zn_dirty(znode))
+				nr_dirty++;
+			else
+				nr_clean++;
+		}
+
+		prev = znode;
+		znode = ubifs_tnc_postorder_next(prev);
+		if (!znode)
+			break;
+
+		/*
+		 * Only two neighbouring leaf-level znodes are of interest
+		 * below, and nothing is compared while replaying.
+		 */
+		if (prev->level != 0 || znode->level != 0 || c->replaying)
+			continue;
+
+		/*
+		 * When the last key of the previous znode collides with the
+		 * first key of this znode, the order of the names has to be
+		 * verified as well.
+		 */
+		tail = &prev->zbranch[prev->child_cnt - 1];
+		head = &znode->zbranch[0];
+		if (keys_cmp(c, &tail->key, &head->key))
+			continue;
+
+		err = dbg_check_key_order(c, tail, head);
+		if (err < 0)
+			return err;
+		if (err) {
+			ubifs_msg("first znode");
+			dbg_dump_znode(c, prev);
+			ubifs_msg("second znode");
+			dbg_dump_znode(c, znode);
+			return -EINVAL;
+		}
+	}
+
+	if (!extra)
+		return 0;
+
+	/* The counters gathered during the walk must match the TNC counters */
+	if (nr_clean != atomic_long_read(&c->clean_zn_cnt)) {
+		ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld",
+			  atomic_long_read(&c->clean_zn_cnt),
+			  nr_clean);
+		return -EINVAL;
+	}
+	if (nr_dirty != atomic_long_read(&c->dirty_zn_cnt)) {
+		ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld",
+			  atomic_long_read(&c->dirty_zn_cnt),
+			  nr_dirty);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_walk_index - walk the on-flash index.
+ * @c: UBIFS file-system description object
+ * @leaf_cb: called for each leaf node
+ * @znode_cb: called for each indexing node
+ * @priv: private data which is passed to callbacks
+ *
+ * This function walks the UBIFS index and calls the @leaf_cb for each leaf
+ * node and @znode_cb for each indexing node. Returns zero in case of success
+ * and a negative error code in case of failure.
+ *
+ * Either callback may be %NULL, in which case it is not called. A non-zero
+ * value returned by a callback stops the walk and is returned to the caller
+ * after the offending znode has been dumped. @c->tnc_mutex is held for the
+ * whole duration of the walk, and znodes which are not yet in the TNC are
+ * loaded with 'ubifs_load_znode()' and linked into their parent zbranch.
+ *
+ * It would be better if this function removed every znode it pulled into
+ * the TNC, so that the behavior more closely matched the non-debugging
+ * behavior.
+ */
+int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
+		   dbg_znode_callback znode_cb, void *priv)
+{
+	int err;
+	struct ubifs_zbranch *zbr;
+	struct ubifs_znode *znode, *child;
+
+	mutex_lock(&c->tnc_mutex);
+	/* If the root indexing node is not in TNC - pull it */
+	if (!c->zroot.znode) {
+		c->zroot.znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(c->zroot.znode)) {
+			err = PTR_ERR(c->zroot.znode);
+			c->zroot.znode = NULL; /* do not leave an error pointer behind */
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * We are going to traverse the indexing tree in the postorder manner.
+	 * Go down and find the leftmost indexing node where we are going to
+	 * start from.
+	 */
+	znode = c->zroot.znode;
+	while (znode->level > 0) {
+		zbr = &znode->zbranch[0];
+		child = zbr->znode;
+		if (!child) {
+			child = ubifs_load_znode(c, zbr, znode, 0);
+			if (IS_ERR(child)) {
+				err = PTR_ERR(child);
+				goto out_unlock;
+			}
+			zbr->znode = child;
+		}
+
+		znode = child;
+	}
+
+	/*
+	 * Iterate over all indexing nodes. The loop is left once the root
+	 * znode (the one without a parent) has been processed.
+	 */
+	while (1) {
+		int idx;
+
+		cond_resched();
+
+		if (znode_cb) {
+			err = znode_cb(c, znode, priv);
+			if (err) {
+				ubifs_err("znode checking function returned "
+					  "error %d", err);
+				dbg_dump_znode(c, znode);
+				goto out_dump;
+			}
+		}
+		if (leaf_cb && znode->level == 0) {
+			for (idx = 0; idx < znode->child_cnt; idx++) {
+				zbr = &znode->zbranch[idx];
+				err = leaf_cb(c, zbr, priv);
+				if (err) {
+					ubifs_err("leaf checking function "
+						  "returned error %d, for leaf "
+						  "at LEB %d:%d",
+						  err, zbr->lnum, zbr->offs);
+					goto out_dump;
+				}
+			}
+		}
+
+		if (!znode->parent)
+			break;
+
+		idx = znode->iip + 1;
+		znode = znode->parent;
+		if (idx < znode->child_cnt) {
+			/* Switch to the next index in the parent */
+			zbr = &znode->zbranch[idx];
+			child = zbr->znode;
+			if (!child) {
+				child = ubifs_load_znode(c, zbr, znode, idx);
+				if (IS_ERR(child)) {
+					err = PTR_ERR(child);
+					goto out_unlock;
+				}
+				zbr->znode = child;
+			}
+			znode = child;
+		} else
+			/*
+			 * This is the last child, switch to the parent and
+			 * continue. The parent is then processed by the next
+			 * iteration, i.e. after all of its children.
+			 */
+			continue;
+
+		/* Go to the lowest leftmost znode in the new sub-tree */
+		while (znode->level > 0) {
+			zbr = &znode->zbranch[0];
+			child = zbr->znode;
+			if (!child) {
+				child = ubifs_load_znode(c, zbr, znode, 0);
+				if (IS_ERR(child)) {
+					err = PTR_ERR(child);
+					goto out_unlock;
+				}
+				zbr->znode = child;
+			}
+			znode = child;
+		}
+	}
+
+	mutex_unlock(&c->tnc_mutex);
+	return 0;
+
+out_dump:
+	if (znode->parent)
+		zbr = &znode->parent->zbranch[znode->iip];
+	else
+		zbr = &c->zroot;
+	ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
+	dbg_dump_znode(c, znode);
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * add_size - add znode size to partially calculated index size.
+ * @c: UBIFS file-system description object
+ * @znode: znode to add size for
+ * @priv: partially calculated index size
+ *
+ * This is a helper function for 'dbg_check_idx_size()' which is called for
+ * every indexing node and adds its size to the 'long long' variable pointed to
+ * by @priv. The size is what 'ubifs_idx_node_sz()' reports for the child count
+ * of @znode, rounded up to a multiple of 8 bytes. Always returns zero.
+ */
+static int add_size(struct ubifs_info *c, struct ubifs_znode *znode, void *priv)
+{
+	long long *idx_size = priv;
+	int add;
+
+	add = ubifs_idx_node_sz(c, znode->child_cnt);
+	add = ALIGN(add, 8);
+	*idx_size += add;
+	return 0;
+}
+
+/**
+ * dbg_check_idx_size - check index size.
+ * @c: UBIFS file-system description object
+ * @idx_size: size to check
+ *
+ * This function walks the UBIFS index, calculates its size and checks that the
+ * size is equivalent to @idx_size. Returns zero in case of success and a
+ * negative error code in case of failure.
+ */
+int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
+{
+	long long walked_sz = 0;
+	int ret;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_IDX_SZ))
+		return 0;
+
+	/* Sum up the sizes of all indexing nodes */
+	ret = dbg_walk_index(c, NULL, add_size, &walked_sz);
+	if (ret) {
+		ubifs_err("error %d while walking the index", ret);
+		return ret;
+	}
+
+	if (walked_sz == idx_size)
+		return 0;
+
+	ubifs_err("index size check failed: calculated size is %lld, "
+		  "should be %lld", walked_sz, idx_size);
+	dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * struct fsck_inode - information about an inode used when checking the file-system.
+ * @rb: link in the RB-tree of inodes
+ * @inum: inode number
+ * @mode: inode type, permissions, etc
+ * @nlink: inode link count
+ * @xattr_cnt: count of extended attributes
+ * @references: how many directory/xattr entries refer this inode (calculated
+ *              while walking the index)
+ * @calc_cnt: for directory inode count of child directories
+ * @size: inode size (read from on-flash inode)
+ * @xattr_sz: summary size of all extended attributes (read from on-flash
+ *            inode)
+ * @calc_sz: for directories calculated directory size
+ * @calc_xcnt: count of extended attributes
+ * @calc_xsz: calculated summary size of all extended attributes
+ * @xattr_nms: sum of lengths of all extended attribute names belonging to this
+ *             inode (read from on-flash inode)
+ * @calc_xnms: calculated sum of lengths of all extended attribute names
+ */
+struct fsck_inode {
+	struct rb_node rb;
+	ino_t inum;
+	umode_t mode;
+	unsigned int nlink;
+	unsigned int xattr_cnt;
+	int references;
+	int calc_cnt;
+	long long size;
+	unsigned int xattr_sz;
+	long long calc_sz;
+	long long calc_xcnt;
+	long long calc_xsz;
+	unsigned int xattr_nms;
+	long long calc_xnms;
+};
+
+/**
+ * struct fsck_data - private FS checking information.
+ * @inodes: RB-tree of all inodes (contains @struct fsck_inode objects) + */ +struct fsck_data { + struct rb_root inodes; +}; + +/** + * add_inode - add inode information to RB-tree of inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * @ino: raw UBIFS inode to add + * + * This is a helper function for 'check_leaf()' which adds information about + * inode @ino to the RB-tree of inodes. Returns inode information pointer in + * case of success and a negative error code in case of failure. + */ +static struct fsck_inode *add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, + struct ubifs_ino_node *ino) +{ + struct rb_node **p, *parent = NULL; + struct fsck_inode *fscki; + ino_t inum = key_inum_flash(c, &ino->key); + + p = &fsckd->inodes.rb_node; + while (*p) { + parent = *p; + fscki = rb_entry(parent, struct fsck_inode, rb); + if (inum < fscki->inum) + p = &(*p)->rb_left; + else if (inum > fscki->inum) + p = &(*p)->rb_right; + else + return fscki; + } + + if (inum > c->highest_inum) { + ubifs_err("too high inode number, max. is %lu", + (unsigned long)c->highest_inum); + return ERR_PTR(-EINVAL); + } + + fscki = kzalloc(sizeof(struct fsck_inode), GFP_NOFS); + if (!fscki) + return ERR_PTR(-ENOMEM); + + fscki->inum = inum; + fscki->nlink = le32_to_cpu(ino->nlink); + fscki->size = le64_to_cpu(ino->size); + fscki->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + fscki->xattr_sz = le32_to_cpu(ino->xattr_size); + fscki->xattr_nms = le32_to_cpu(ino->xattr_names); + fscki->mode = le32_to_cpu(ino->mode); + if (S_ISDIR(fscki->mode)) { + fscki->calc_sz = UBIFS_INO_NODE_SZ; + fscki->calc_cnt = 2; + } + rb_link_node(&fscki->rb, parent, p); + rb_insert_color(&fscki->rb, &fsckd->inodes); + return fscki; +} + +/** + * search_inode - search inode in the RB-tree of inodes. 
+ * @fsckd: FS checking information
+ * @inum: inode number to search
+ *
+ * Helper for 'check_leaf()': descends the RB-tree of inodes comparing @inum
+ * with the inode number stored in each visited object. Returns the matching
+ * inode information pointer, or %NULL if the inode was not found.
+ */
+static struct fsck_inode *search_inode(struct fsck_data *fsckd, ino_t inum)
+{
+	struct rb_node *node = fsckd->inodes.rb_node;
+
+	while (node) {
+		struct fsck_inode *cur = rb_entry(node, struct fsck_inode, rb);
+
+		if (inum == cur->inum)
+			return cur;
+		/* Smaller inode numbers live in the left sub-tree */
+		node = inum < cur->inum ? node->rb_left : node->rb_right;
+	}
+
+	return NULL;
+}
+
+/**
+ * read_add_inode - read inode node and add it to RB-tree of inodes.
+ * @c: UBIFS file-system description object
+ * @fsckd: FS checking information
+ * @inum: inode number to read
+ *
+ * This is a helper function for 'check_leaf()' which finds inode node @inum in
+ * the index, reads it, and adds it to the RB-tree of inodes. Returns inode
+ * information pointer in case of success and a negative error code in case of
+ * failure.
+ */ +static struct fsck_inode *read_add_inode(struct ubifs_info *c, + struct fsck_data *fsckd, ino_t inum) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + + fscki = search_inode(fsckd, inum); + if (fscki) + return fscki; + + ino_key_init(c, &key, inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", (unsigned long)inum); + return ERR_PTR(-ENOENT); + } else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)inum); + return ERR_PTR(err); + } + + zbr = &znode->zbranch[n]; + if (zbr->len < UBIFS_INO_NODE_SZ) { + ubifs_err("bad node %lu node length %d", + (unsigned long)inum, zbr->len); + return ERR_PTR(-EINVAL); + } + + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return ERR_PTR(-ENOMEM); + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return ERR_PTR(err); + } + + fscki = add_inode(c, fsckd, ino); + kfree(ino); + if (IS_ERR(fscki)) { + ubifs_err("error %ld while adding inode %lu node", + PTR_ERR(fscki), (unsigned long)inum); + return fscki; + } + + return fscki; +} + +/** + * check_leaf - check leaf node. + * @c: UBIFS file-system description object + * @zbr: zbranch of the leaf node to check + * @priv: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which is called for + * every single leaf node while walking the indexing tree. It checks that the + * leaf node referred from the indexing tree exists, has correct CRC, and does + * some other basic validation. This function is also responsible for building + * an RB-tree of inodes - it adds all inodes into the RB-tree. 
It also + * calculates reference count, size, etc for each inode in order to later + * compare them to the information stored inside the inodes and detect possible + * inconsistencies. Returns zero in case of success and a negative error code + * in case of failure. + */ +static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + ino_t inum; + void *node; + int err, type = key_type(c, &zbr->key); + struct fsck_inode *fscki; + + if (zbr->len < UBIFS_CH_SZ) { + ubifs_err("bad leaf length %d (LEB %d:%d)", + zbr->len, zbr->lnum, zbr->offs); + return -EINVAL; + } + + node = kmalloc(zbr->len, GFP_NOFS); + if (!node) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) { + ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + goto out_free; + } + + /* If this is an inode node, add it to RB-tree of inodes */ + if (type == UBIFS_INO_KEY) { + fscki = add_inode(c, priv, node); + if (IS_ERR(fscki)) { + err = PTR_ERR(fscki); + ubifs_err("error %d while adding inode node", err); + goto out_dump; + } + goto out; + } + + if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && + type != UBIFS_DATA_KEY) { + ubifs_err("unexpected node type %d at LEB %d:%d", + type, zbr->lnum, zbr->offs); + err = -EINVAL; + goto out_free; + } + + if (type == UBIFS_DATA_KEY) { + long long blk_offs; + struct ubifs_data_node *dn = node; + + /* + * Search the inode node this data node belongs to and insert + * it to the RB-tree of inodes. 
+		 */
+		inum = key_inum_flash(c, &dn->key);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err("error %d while processing data node and "
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		/* Make sure the data node is within inode size */
+		blk_offs = key_block_flash(c, &dn->key);
+		blk_offs <<= UBIFS_BLOCK_SHIFT;
+		blk_offs += le32_to_cpu(dn->size);
+		if (blk_offs > fscki->size) {
+			ubifs_err("data node at LEB %d:%d is not within inode "
+				  "size %lld", zbr->lnum, zbr->offs,
+				  fscki->size);
+			err = -EINVAL;
+			goto out_dump;
+		}
+	} else {
+		int nlen;
+		struct ubifs_dent_node *dent = node;
+		struct fsck_inode *fscki1;
+
+		err = ubifs_validate_entry(c, dent);
+		if (err)
+			goto out_dump;
+
+		/*
+		 * Look up the inode this entry refers to (fscki) and then its
+		 * parent (fscki1); each has its own error check and code.
+		 */
+		inum = le64_to_cpu(dent->inum);
+		fscki = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki)) {
+			err = PTR_ERR(fscki);
+			ubifs_err("error %d while processing entry node and "
+				  "trying to find inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		/* Count how many direntries or xentries refer to this inode */
+		fscki->references += 1;
+
+		inum = key_inum_flash(c, &dent->key);
+		fscki1 = read_add_inode(c, priv, inum);
+		if (IS_ERR(fscki1)) {
+			err = PTR_ERR(fscki1);
+			ubifs_err("error %d while processing entry node and "
+				  "trying to find parent inode node %lu",
+				  err, (unsigned long)inum);
+			goto out_dump;
+		}
+
+		nlen = le16_to_cpu(dent->nlen);
+		if (type == UBIFS_XENT_KEY) {
+			fscki1->calc_xcnt += 1;
+			fscki1->calc_xsz += CALC_DENT_SIZE(nlen);
+			fscki1->calc_xsz += CALC_XATTR_BYTES(fscki->size);
+			fscki1->calc_xnms += nlen;
+		} else {
+			fscki1->calc_sz += CALC_DENT_SIZE(nlen);
+			if (dent->type == UBIFS_ITYPE_DIR)
+				fscki1->calc_cnt += 1;
+		}
+	}
+
+out:
+	kfree(node);
+	return 0;
+
+out_dump:
+	ubifs_msg("dump of node at LEB %d:%d", zbr->lnum,
zbr->offs); + dbg_dump_node(c, node); +out_free: + kfree(node); + return err; +} + +/** + * free_inodes - free RB-tree of inodes. + * @fsckd: FS checking information + */ +static void free_inodes(struct fsck_data *fsckd) +{ + struct rb_node *this = fsckd->inodes.rb_node; + struct fsck_inode *fscki; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &fscki->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(fscki); + } + } +} + +/** + * check_inodes - checks all inodes. + * @c: UBIFS file-system description object + * @fsckd: FS checking information + * + * This is a helper function for 'dbg_check_filesystem()' which walks the + * RB-tree of inodes after the index scan has been finished, and checks that + * inode nlink, size, etc are correct. Returns zero if inodes are fine, + * %-EINVAL if not, and a negative error code in case of failure. + */ +static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) +{ + int n, err; + union ubifs_key key; + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + struct ubifs_ino_node *ino; + struct fsck_inode *fscki; + struct rb_node *this = rb_first(&fsckd->inodes); + + while (this) { + fscki = rb_entry(this, struct fsck_inode, rb); + this = rb_next(this); + + if (S_ISDIR(fscki->mode)) { + /* + * Directories have to have exactly one reference (they + * cannot have hardlinks), although root inode is an + * exception. 
+ */ + if (fscki->inum != UBIFS_ROOT_INO && + fscki->references != 1) { + ubifs_err("directory inode %lu has %d " + "direntries which refer it, but " + "should be 1", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->inum == UBIFS_ROOT_INO && + fscki->references != 0) { + ubifs_err("root inode %lu has non-zero (%d) " + "direntries which refer it", + (unsigned long)fscki->inum, + fscki->references); + goto out_dump; + } + if (fscki->calc_sz != fscki->size) { + ubifs_err("directory inode %lu size is %lld, " + "but calculated size is %lld", + (unsigned long)fscki->inum, + fscki->size, fscki->calc_sz); + goto out_dump; + } + if (fscki->calc_cnt != fscki->nlink) { + ubifs_err("directory inode %lu nlink is %d, " + "but calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->calc_cnt); + goto out_dump; + } + } else { + if (fscki->references != fscki->nlink) { + ubifs_err("inode %lu nlink is %d, but " + "calculated nlink is %d", + (unsigned long)fscki->inum, + fscki->nlink, fscki->references); + goto out_dump; + } + } + if (fscki->xattr_sz != fscki->calc_xsz) { + ubifs_err("inode %lu has xattr size %u, but " + "calculated size is %lld", + (unsigned long)fscki->inum, fscki->xattr_sz, + fscki->calc_xsz); + goto out_dump; + } + if (fscki->xattr_cnt != fscki->calc_xcnt) { + ubifs_err("inode %lu has %u xattrs, but " + "calculated count is %lld", + (unsigned long)fscki->inum, + fscki->xattr_cnt, fscki->calc_xcnt); + goto out_dump; + } + if (fscki->xattr_nms != fscki->calc_xnms) { + ubifs_err("inode %lu has xattr names' size %u, but " + "calculated names' size is %lld", + (unsigned long)fscki->inum, fscki->xattr_nms, + fscki->calc_xnms); + goto out_dump; + } + } + + return 0; + +out_dump: + /* Read the bad inode and dump it */ + ino_key_init(c, &key, fscki->inum); + err = ubifs_lookup_level0(c, &key, &znode, &n); + if (!err) { + ubifs_err("inode %lu not found in index", + (unsigned long)fscki->inum); + return -ENOENT; + } 
else if (err < 0) { + ubifs_err("error %d while looking up inode %lu", + err, (unsigned long)fscki->inum); + return err; + } + + zbr = &znode->zbranch[n]; + ino = kmalloc(zbr->len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, ino); + if (err) { + ubifs_err("cannot read inode node at LEB %d:%d, error %d", + zbr->lnum, zbr->offs, err); + kfree(ino); + return err; + } + + ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + (unsigned long)fscki->inum, zbr->lnum, zbr->offs); + dbg_dump_node(c, ino); + kfree(ino); + return -EINVAL; +} + +/** + * dbg_check_filesystem - check the file-system. + * @c: UBIFS file-system description object + * + * This function checks the file system, namely: + * o makes sure that all leaf nodes exist and their CRCs are correct; + * o makes sure inode nlink, size, xattr size/count are correct (for all + * inodes). + * + * The function reads whole indexing tree and all nodes, so it is pretty + * heavy-weight. Returns zero if the file-system is consistent, %-EINVAL if + * not, and a negative error code in case of failure. 
+ */ +int dbg_check_filesystem(struct ubifs_info *c) +{ + int err; + struct fsck_data fsckd; + + if (!(ubifs_chk_flags & UBIFS_CHK_FS)) + return 0; + + fsckd.inodes = RB_ROOT; + err = dbg_walk_index(c, check_leaf, NULL, &fsckd); + if (err) + goto out_free; + + err = check_inodes(c, &fsckd); + if (err) + goto out_free; + + free_inodes(&fsckd); + return 0; + +out_free: + ubifs_err("file-system check failed with error %d", err); + dump_stack(); + free_inodes(&fsckd); + return err; +} + +static int invocation_cnt; + +int dbg_force_in_the_gaps(void) +{ + if (!dbg_force_in_the_gaps_enabled) + return 0; + /* Force in-the-gaps every 8th commit */ + return !((invocation_cnt++) & 0x7); +} + +/* Failure mode for recovery testing */ + +#define chance(n, d) (simple_rand() <= (n) * 32768LL / (d)) + +struct failure_mode_info { + struct list_head list; + struct ubifs_info *c; +}; + +static LIST_HEAD(fmi_list); +static DEFINE_SPINLOCK(fmi_lock); + +static unsigned int next; + +static int simple_rand(void) +{ + if (next == 0) + next = current->pid; + next = next * 1103515245 + 12345; + return (next >> 16) & 32767; +} + +static void failure_mode_init(struct ubifs_info *c) +{ + struct failure_mode_info *fmi; + + fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS); + if (!fmi) { + ubifs_err("Failed to register failure mode - no memory"); + return; + } + fmi->c = c; + spin_lock(&fmi_lock); + list_add_tail(&fmi->list, &fmi_list); + spin_unlock(&fmi_lock); +} + +static void failure_mode_exit(struct ubifs_info *c) +{ + struct failure_mode_info *fmi, *tmp; + + spin_lock(&fmi_lock); + list_for_each_entry_safe(fmi, tmp, &fmi_list, list) + if (fmi->c == c) { + list_del(&fmi->list); + kfree(fmi); + } + spin_unlock(&fmi_lock); +} + +static struct ubifs_info *dbg_find_info(struct ubi_volume_desc *desc) +{ + struct failure_mode_info *fmi; + + spin_lock(&fmi_lock); + list_for_each_entry(fmi, &fmi_list, list) + if (fmi->c->ubi == desc) { + struct ubifs_info *c = fmi->c; + + 
spin_unlock(&fmi_lock); + return c; + } + spin_unlock(&fmi_lock); + return NULL; +} + +static int in_failure_mode(struct ubi_volume_desc *desc) +{ + struct ubifs_info *c = dbg_find_info(desc); + + if (c && dbg_failure_mode) + return c->dbg->failure_mode; + return 0; +} + +static int do_fail(struct ubi_volume_desc *desc, int lnum, int write) +{ + struct ubifs_info *c = dbg_find_info(desc); + struct ubifs_debug_info *d; + + if (!c || !dbg_failure_mode) + return 0; + d = c->dbg; + if (d->failure_mode) + return 1; + if (!d->fail_cnt) { + /* First call - decide delay to failure */ + if (chance(1, 2)) { + unsigned int delay = 1 << (simple_rand() >> 11); + + if (chance(1, 2)) { + d->fail_delay = 1; + d->fail_timeout = jiffies + + msecs_to_jiffies(delay); + dbg_rcvry("failing after %ums", delay); + } else { + d->fail_delay = 2; + d->fail_cnt_max = delay; + dbg_rcvry("failing after %u calls", delay); + } + } + d->fail_cnt += 1; + } + /* Determine if failure delay has expired */ + if (d->fail_delay == 1) { + if (time_before(jiffies, d->fail_timeout)) + return 0; + } else if (d->fail_delay == 2) + if (d->fail_cnt++ < d->fail_cnt_max) + return 0; + if (lnum == UBIFS_SB_LNUM) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in super block LEB %d", lnum); + } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in master LEB %d", lnum); + } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { + if (write) { + if (chance(99, 100)) + return 0; + } else if (chance(399, 400)) + return 0; + dbg_rcvry("failing in log LEB %d", lnum); + } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { + if (write) { + if (chance(7, 8)) + return 0; + } else if (chance(19, 20)) + return 0; + dbg_rcvry("failing in LPT LEB %d", lnum); + } else if (lnum >= c->orph_first && lnum <= c->orph_last) { + if (write) { + if (chance(1, 2)) + return 0; + } else if (chance(9, 
10)) + return 0; + dbg_rcvry("failing in orphan LEB %d", lnum); + } else if (lnum == c->ihead_lnum) { + if (chance(99, 100)) + return 0; + dbg_rcvry("failing in index head LEB %d", lnum); + } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { + if (chance(9, 10)) + return 0; + dbg_rcvry("failing in GC head LEB %d", lnum); + } else if (write && !RB_EMPTY_ROOT(&c->buds) && + !ubifs_search_bud(c, lnum)) { + if (chance(19, 20)) + return 0; + dbg_rcvry("failing in non-bud LEB %d", lnum); + } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || + c->cmt_state == COMMIT_RUNNING_REQUIRED) { + if (chance(999, 1000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit running", lnum); + } else { + if (chance(9999, 10000)) + return 0; + dbg_rcvry("failing in bud LEB %d commit not running", lnum); + } + ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum); + d->failure_mode = 1; + dump_stack(); + return 1; +} + +static void cut_data(const void *buf, int len) +{ + int flen, i; + unsigned char *p = (void *)buf; + + flen = (len * (long long)simple_rand()) >> 15; + for (i = flen; i < len; i++) + p[i] = 0xff; +} + +int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset, + int len, int check) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_leb_read(desc, lnum, buf, offset, len, check); +} + +int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf, + int offset, int len, int dtype) +{ + int err, failing; + + if (in_failure_mode(desc)) + return -EIO; + failing = do_fail(desc, lnum, 1); + if (failing) + cut_data(buf, len); + err = ubi_leb_write(desc, lnum, buf, offset, len, dtype); + if (err) + return err; + if (failing) + return -EIO; + return 0; +} + +int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf, + int len, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 1)) + return -EIO; + err = ubi_leb_change(desc, lnum, buf, len, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 1)) 
+ return -EIO; + return 0; +} + +int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_erase(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_unmap(desc, lnum); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum) +{ + if (in_failure_mode(desc)) + return -EIO; + return ubi_is_mapped(desc, lnum); +} + +int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype) +{ + int err; + + if (do_fail(desc, lnum, 0)) + return -EIO; + err = ubi_leb_map(desc, lnum, dtype); + if (err) + return err; + if (do_fail(desc, lnum, 0)) + return -EIO; + return 0; +} + +/** + * ubifs_debugging_init - initialize UBIFS debugging. + * @c: UBIFS file-system description object + * + * This function initializes debugging-related data for the file system. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_debugging_init(struct ubifs_info *c) +{ + c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL); + if (!c->dbg) + return -ENOMEM; + + c->dbg->buf = vmalloc(c->leb_size); + if (!c->dbg->buf) + goto out; + + failure_mode_init(c); + return 0; + +out: + kfree(c->dbg); + return -ENOMEM; +} + +/** + * ubifs_debugging_exit - free debugging data. + * @c: UBIFS file-system description object + */ +void ubifs_debugging_exit(struct ubifs_info *c) +{ + failure_mode_exit(c); + vfree(c->dbg->buf); + kfree(c->dbg); +} + +/* + * Root directory for UBIFS stuff in debugfs. Contains sub-directories which + * contain the stuff specific to particular file-system mounts. + */ +static struct dentry *dfs_rootdir; + +/** + * dbg_debugfs_init - initialize debugfs file-system. 
+ * + * UBIFS uses debugfs file-system to expose various debugging knobs to + * user-space. This function creates "ubifs" directory in the debugfs + * file-system. Returns zero in case of success and a negative error code in + * case of failure. + */ +int dbg_debugfs_init(void) +{ + dfs_rootdir = debugfs_create_dir("ubifs", NULL); + if (IS_ERR(dfs_rootdir)) { + int err = PTR_ERR(dfs_rootdir); + ubifs_err("cannot create \"ubifs\" debugfs directory, " + "error %d\n", err); + return err; + } + + return 0; +} + +/** + * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system. + */ +void dbg_debugfs_exit(void) +{ + debugfs_remove(dfs_rootdir); +} + +static int open_debugfs_file(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +static ssize_t write_debugfs_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct ubifs_info *c = file->private_data; + struct ubifs_debug_info *d = c->dbg; + + if (file->f_path.dentry == d->dfs_dump_lprops) + dbg_dump_lprops(c); + else if (file->f_path.dentry == d->dfs_dump_budg) { + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + } else if (file->f_path.dentry == d->dfs_dump_tnc) { + mutex_lock(&c->tnc_mutex); + dbg_dump_tnc(c); + mutex_unlock(&c->tnc_mutex); + } else + return -EINVAL; + + *ppos += count; + return count; +} + +static const struct file_operations dfs_fops = { + .open = open_debugfs_file, + .write = write_debugfs_file, + .owner = THIS_MODULE, +}; + +/** + * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance. + * @c: UBIFS file-system description object + * + * This function creates all debugfs files for this instance of UBIFS. Returns + * zero in case of success and a negative error code in case of failure. 
+ *
+ * Note, the only reason we have not merged this function with the
+ * 'ubifs_debugging_init()' function is because it is better to initialize
+ * debugfs interfaces at the very end of the mount process, and remove them at
+ * the very beginning of the unmount process.
+ */
+int dbg_debugfs_init_fs(struct ubifs_info *c)
+{
+	int err;
+	const char *fname;
+	struct dentry *dent;
+	struct ubifs_debug_info *d = c->dbg;
+
+	sprintf(d->dfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir);
+	if (IS_ERR(d->dfs_dir)) {
+		err = PTR_ERR(d->dfs_dir);
+		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+			  d->dfs_dir_name, err);
+		goto out;
+	}
+
+	fname = "dump_lprops";
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dfs_dump_lprops = dent;
+
+	fname = "dump_budg";
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dfs_dump_budg = dent;
+
+	fname = "dump_tnc";
+	dent = debugfs_create_file(fname, S_IWUGO, d->dfs_dir, c, &dfs_fops);
+	if (IS_ERR(dent))
+		goto out_remove;
+	d->dfs_dump_tnc = dent;
+
+	return 0;
+
+out_remove:
+	err = PTR_ERR(dent);
+	ubifs_err("cannot create \"%s\" debugfs file, error %d\n",
+		  fname, err);
+	if (d->dfs_dump_tnc)
+		debugfs_remove(d->dfs_dump_tnc);
+	if (d->dfs_dump_budg)
+		debugfs_remove(d->dfs_dump_budg);
+	if (d->dfs_dump_lprops)
+		debugfs_remove(d->dfs_dump_lprops);
+	debugfs_remove(d->dfs_dir);
+out:
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit_fs - remove all debugfs files.
+ * @c: UBIFS file-system description object
+ */
+void dbg_debugfs_exit_fs(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d = c->dbg;
+
+	debugfs_remove(d->dfs_dump_tnc);
+	debugfs_remove(d->dfs_dump_budg);
+	debugfs_remove(d->dfs_dump_lprops);
+	debugfs_remove(d->dfs_dir);
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -Nurd linux-2.6.24/fs/ubifs/debug.h ubifs-v2.6.24/fs/ubifs/debug.h
--- linux-2.6.24/fs/ubifs/debug.h	1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/debug.h	2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,507 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+#ifndef __UBIFS_DEBUG_H__
+#define __UBIFS_DEBUG_H__
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * ubifs_debug_info - per-FS debugging information.
+ * @buf: a buffer of LEB size, used for various purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ * @chk_lpt_sz: used by LPT tree size checker
+ * @chk_lpt_sz2: used by LPT tree size checker
+ * @chk_lpt_wastage: used by LPT tree size checker
+ * @chk_lpt_lebs: used by LPT tree size checker
+ * @new_nhead_offs: used by LPT tree size checker
+ * @new_ihead_lnum: used by debugging to check @c->ihead_lnum
+ * @new_ihead_offs: used by debugging to check @c->ihead_offs
+ *
+ * @saved_lst: saved lprops statistics (used by 'dbg_save_space_info()')
+ * @saved_free: saved free space (used by 'dbg_save_space_info()')
+ *
+ * @dfs_dir_name: name of debugfs directory containing this file-system's files
+ * @dfs_dir: direntry object of the file-system debugfs directory
+ * @dfs_dump_lprops: "dump lprops" debugfs knob
+ * @dfs_dump_budg: "dump budgeting information" debugfs knob
+ * @dfs_dump_tnc: "dump TNC" debugfs knob
+ */
+struct ubifs_debug_info {
+	void *buf;
+	struct ubifs_zbranch old_zroot;
+	int old_zroot_level;
+	unsigned long long old_zroot_sqnum;
+	int failure_mode;
+	int fail_delay;
+	unsigned long fail_timeout;
+	unsigned int fail_cnt;
+	unsigned int fail_cnt_max;
+	long long chk_lpt_sz;
+	long long chk_lpt_sz2;
+	long long chk_lpt_wastage;
+	int chk_lpt_lebs;
+	int new_nhead_offs;
+	int new_ihead_lnum;
+	int new_ihead_offs;
+
+	struct ubifs_lp_stats saved_lst;
+	long long saved_free;
+
+	char dfs_dir_name[100];
+	struct dentry *dfs_dir;
+	struct dentry
*dfs_dump_lprops; + struct dentry *dfs_dump_budg; + struct dentry *dfs_dump_tnc; +}; + +#define ubifs_assert(expr) do { \ + if (unlikely(!(expr))) { \ + printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \ + __func__, __LINE__, current->pid); \ + dbg_dump_stack(); \ + } \ +} while (0) + +#define ubifs_assert_cmt_locked(c) do { \ + if (unlikely(down_write_trylock(&(c)->commit_sem))) { \ + up_write(&(c)->commit_sem); \ + printk(KERN_CRIT "commit lock is not locked!\n"); \ + ubifs_assert(0); \ + } \ +} while (0) + +#define dbg_dump_stack() do { \ + if (!dbg_failure_mode) \ + dump_stack(); \ +} while (0) + +/* Generic debugging messages */ +#define dbg_msg(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +#define dbg_do_msg(typ, fmt, ...) do { \ + if (ubifs_msg_flags & typ) \ + dbg_msg(fmt, ##__VA_ARGS__); \ +} while (0) + +#define dbg_err(fmt, ...) do { \ + spin_lock(&dbg_lock); \ + ubifs_err(fmt, ##__VA_ARGS__); \ + spin_unlock(&dbg_lock); \ +} while (0) + +const char *dbg_key_str0(const struct ubifs_info *c, + const union ubifs_key *key); +const char *dbg_key_str1(const struct ubifs_info *c, + const union ubifs_key *key); + +/* + * DBGKEY macros require @dbg_lock to be held, which it is in the dbg message + * macros. + */ +#define DBGKEY(key) dbg_key_str0(c, (key)) +#define DBGKEY1(key) dbg_key_str1(c, (key)) + +/* General messages */ +#define dbg_gen(fmt, ...) dbg_do_msg(UBIFS_MSG_GEN, fmt, ##__VA_ARGS__) + +/* Additional journal messages */ +#define dbg_jnl(fmt, ...) dbg_do_msg(UBIFS_MSG_JNL, fmt, ##__VA_ARGS__) + +/* Additional TNC messages */ +#define dbg_tnc(fmt, ...) dbg_do_msg(UBIFS_MSG_TNC, fmt, ##__VA_ARGS__) + +/* Additional lprops messages */ +#define dbg_lp(fmt, ...) dbg_do_msg(UBIFS_MSG_LP, fmt, ##__VA_ARGS__) + +/* Additional LEB find messages */ +#define dbg_find(fmt, ...) 
dbg_do_msg(UBIFS_MSG_FIND, fmt, ##__VA_ARGS__) + +/* Additional mount messages */ +#define dbg_mnt(fmt, ...) dbg_do_msg(UBIFS_MSG_MNT, fmt, ##__VA_ARGS__) + +/* Additional I/O messages */ +#define dbg_io(fmt, ...) dbg_do_msg(UBIFS_MSG_IO, fmt, ##__VA_ARGS__) + +/* Additional commit messages */ +#define dbg_cmt(fmt, ...) dbg_do_msg(UBIFS_MSG_CMT, fmt, ##__VA_ARGS__) + +/* Additional budgeting messages */ +#define dbg_budg(fmt, ...) dbg_do_msg(UBIFS_MSG_BUDG, fmt, ##__VA_ARGS__) + +/* Additional log messages */ +#define dbg_log(fmt, ...) dbg_do_msg(UBIFS_MSG_LOG, fmt, ##__VA_ARGS__) + +/* Additional gc messages */ +#define dbg_gc(fmt, ...) dbg_do_msg(UBIFS_MSG_GC, fmt, ##__VA_ARGS__) + +/* Additional scan messages */ +#define dbg_scan(fmt, ...) dbg_do_msg(UBIFS_MSG_SCAN, fmt, ##__VA_ARGS__) + +/* Additional recovery messages */ +#define dbg_rcvry(fmt, ...) dbg_do_msg(UBIFS_MSG_RCVRY, fmt, ##__VA_ARGS__) + +/* + * Debugging message type flags (must match msg_type_names in debug.c). + * + * UBIFS_MSG_GEN: general messages + * UBIFS_MSG_JNL: journal messages + * UBIFS_MSG_MNT: mount messages + * UBIFS_MSG_CMT: commit messages + * UBIFS_MSG_FIND: LEB find messages + * UBIFS_MSG_BUDG: budgeting messages + * UBIFS_MSG_GC: garbage collection messages + * UBIFS_MSG_TNC: TNC messages + * UBIFS_MSG_LP: lprops messages + * UBIFS_MSG_IO: I/O messages + * UBIFS_MSG_LOG: log messages + * UBIFS_MSG_SCAN: scan messages + * UBIFS_MSG_RCVRY: recovery messages + */ +enum { + UBIFS_MSG_GEN = 0x1, + UBIFS_MSG_JNL = 0x2, + UBIFS_MSG_MNT = 0x4, + UBIFS_MSG_CMT = 0x8, + UBIFS_MSG_FIND = 0x10, + UBIFS_MSG_BUDG = 0x20, + UBIFS_MSG_GC = 0x40, + UBIFS_MSG_TNC = 0x80, + UBIFS_MSG_LP = 0x100, + UBIFS_MSG_IO = 0x200, + UBIFS_MSG_LOG = 0x400, + UBIFS_MSG_SCAN = 0x800, + UBIFS_MSG_RCVRY = 0x1000, +}; + +/* Debugging message type flags for each default debug message level */ +#define UBIFS_MSG_LVL_0 0 +#define UBIFS_MSG_LVL_1 0x1 +#define UBIFS_MSG_LVL_2 0x7f +#define UBIFS_MSG_LVL_3 0xffff + +/* + * 
Debugging check flags (must match chk_names in debug.c). + * + * UBIFS_CHK_GEN: general checks + * UBIFS_CHK_TNC: check TNC + * UBIFS_CHK_IDX_SZ: check index size + * UBIFS_CHK_ORPH: check orphans + * UBIFS_CHK_OLD_IDX: check the old index + * UBIFS_CHK_LPROPS: check lprops + * UBIFS_CHK_FS: check the file-system + */ +enum { + UBIFS_CHK_GEN = 0x1, + UBIFS_CHK_TNC = 0x2, + UBIFS_CHK_IDX_SZ = 0x4, + UBIFS_CHK_ORPH = 0x8, + UBIFS_CHK_OLD_IDX = 0x10, + UBIFS_CHK_LPROPS = 0x20, + UBIFS_CHK_FS = 0x40, +}; + +/* + * Special testing flags (must match tst_names in debug.c). + * + * UBIFS_TST_FORCE_IN_THE_GAPS: force the use of in-the-gaps method + * UBIFS_TST_RCVRY: failure mode for recovery testing + */ +enum { + UBIFS_TST_FORCE_IN_THE_GAPS = 0x2, + UBIFS_TST_RCVRY = 0x4, +}; + +#if CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 1 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_1 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 2 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_2 +#elif CONFIG_UBIFS_FS_DEBUG_MSG_LVL == 3 +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_3 +#else +#define UBIFS_MSG_FLAGS_DEFAULT UBIFS_MSG_LVL_0 +#endif + +#ifdef CONFIG_UBIFS_FS_DEBUG_CHKS +#define UBIFS_CHK_FLAGS_DEFAULT 0xffffffff +#else +#define UBIFS_CHK_FLAGS_DEFAULT 0 +#endif + +extern spinlock_t dbg_lock; + +extern unsigned int ubifs_msg_flags; +extern unsigned int ubifs_chk_flags; +extern unsigned int ubifs_tst_flags; + +int ubifs_debugging_init(struct ubifs_info *c); +void ubifs_debugging_exit(struct ubifs_info *c); + +/* Dump functions */ +const char *dbg_ntype(int type); +const char *dbg_cstate(int cmt_state); +const char *dbg_get_key_dump(const struct ubifs_info *c, + const union ubifs_key *key); +void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode); +void dbg_dump_node(const struct ubifs_info *c, const void *node); +void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum, + int offs); +void dbg_dump_budget_req(const struct ubifs_budget_req *req); +void 
dbg_dump_lstats(const struct ubifs_lp_stats *lst); +void dbg_dump_budg(struct ubifs_info *c); +void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp); +void dbg_dump_lprops(struct ubifs_info *c); +void dbg_dump_lpt_info(struct ubifs_info *c); +void dbg_dump_leb(const struct ubifs_info *c, int lnum); +void dbg_dump_znode(const struct ubifs_info *c, + const struct ubifs_znode *znode); +void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat); +void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, + struct ubifs_nnode *parent, int iip); +void dbg_dump_tnc(struct ubifs_info *c); +void dbg_dump_index(struct ubifs_info *c); +void dbg_dump_lpt_lebs(const struct ubifs_info *c); + +/* Checking helper functions */ +typedef int (*dbg_leaf_callback)(struct ubifs_info *c, + struct ubifs_zbranch *zbr, void *priv); +typedef int (*dbg_znode_callback)(struct ubifs_info *c, + struct ubifs_znode *znode, void *priv); +int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, + dbg_znode_callback znode_cb, void *priv); + +/* Checking functions */ +void dbg_save_space_info(struct ubifs_info *c); +int dbg_check_space_info(struct ubifs_info *c); +int dbg_check_lprops(struct ubifs_info *c); +int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int dbg_check_cats(struct ubifs_info *c); +int dbg_check_ltab(struct ubifs_info *c); +int dbg_chk_lpt_free_spc(struct ubifs_info *c); +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len); +int dbg_check_synced_i_size(struct inode *inode); +int dbg_check_dir_size(struct ubifs_info *c, const struct inode *dir); +int dbg_check_tnc(struct ubifs_info *c, int extra); +int dbg_check_idx_size(struct ubifs_info *c, long long idx_size); +int dbg_check_filesystem(struct ubifs_info *c); +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int 
add_pos);
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+			int row, int col);
+
+/* Force the use of in-the-gaps method for testing */
+
+#define dbg_force_in_the_gaps_enabled \
+	(ubifs_tst_flags & UBIFS_TST_FORCE_IN_THE_GAPS)
+
+int dbg_force_in_the_gaps(void);
+
+/* Failure mode for recovery testing */
+
+#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
+
+/*
+ * Unless UBIFS_DBG_PRESERVE_UBI is defined, every use of the UBI LEB I/O
+ * function names below is redirected to the corresponding 'dbg_*()' function
+ * declared in this header.
+ */
+#ifndef UBIFS_DBG_PRESERVE_UBI
+
+#define ubi_leb_read dbg_leb_read
+#define ubi_leb_write dbg_leb_write
+#define ubi_leb_change dbg_leb_change
+#define ubi_leb_erase dbg_leb_erase
+#define ubi_leb_unmap dbg_leb_unmap
+#define ubi_is_mapped dbg_is_mapped
+#define ubi_leb_map dbg_leb_map
+
+#endif
+
+int dbg_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
+		 int len, int check);
+int dbg_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
+		  int offset, int len, int dtype);
+int dbg_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
+		   int len, int dtype);
+int dbg_leb_erase(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_unmap(struct ubi_volume_desc *desc, int lnum);
+int dbg_is_mapped(struct ubi_volume_desc *desc, int lnum);
+int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
+
+/*
+ * Short-hand wrappers which supply the last argument of the 'dbg_leb_*()'
+ * functions: 'dbg_read()' passes 0 for @check, while 'dbg_write()' and
+ * 'dbg_change()' pass %UBI_UNKNOWN for @dtype. The return value of the wrapped
+ * function is returned unchanged.
+ */
+static inline int dbg_read(struct ubi_volume_desc *desc, int lnum, char *buf,
+			   int offset, int len)
+{
+	return dbg_leb_read(desc, lnum, buf, offset, len, 0);
+}
+
+static inline int dbg_write(struct ubi_volume_desc *desc, int lnum,
+			    const void *buf, int offset, int len)
+{
+	return dbg_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
+}
+
+static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
+			     const void *buf, int len)
+{
+	return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
+}
+
+/* Debugfs-related stuff */
+int dbg_debugfs_init(void);
+void dbg_debugfs_exit(void);
+int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_exit_fs(struct
ubifs_info *c);
+
+#else /* !CONFIG_UBIFS_FS_DEBUG */
+
+/*
+ * Use "if (0)" to make compiler check arguments even if debugging is off.
+ * Because the condition is a constant zero, neither the asserted expression
+ * nor the message arguments are evaluated at run time.
+ */
+#define ubifs_assert(expr) do { \
+	if (0 && (expr)) \
+		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+		       __func__, __LINE__, current->pid); \
+} while (0)
+
+#define dbg_err(fmt, ...) do { \
+	if (0) \
+		ubifs_err(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define dbg_msg(fmt, ...) do { \
+	if (0) \
+		printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", \
+		       current->pid, __func__, ##__VA_ARGS__); \
+} while (0)
+
+#define dbg_dump_stack() /* expands to nothing */
+#define ubifs_assert_cmt_locked(c) /* expands to nothing */
+
+/* All per-subsystem message macros map to the disabled 'dbg_msg()' above */
+#define dbg_gen(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_jnl(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_tnc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_lp(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_find(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_mnt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_io(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_cmt(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_budg(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_log(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_gc(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_scan(fmt, ...) dbg_msg(fmt, ##__VA_ARGS__)
+#define dbg_rcvry(fmt, ...)
dbg_msg(fmt, ##__VA_ARGS__)
+
+/*
+ * No-op replacements used when debugging is compiled out. Each stub expands
+ * to 0, to an empty string, to an empty statement expression '({})' or to
+ * nothing at all, and none of them evaluates its arguments.
+ */
+#define DBGKEY(key) ((char *)(key))
+#define DBGKEY1(key) ((char *)(key))
+
+#define ubifs_debugging_init(c) 0
+#define ubifs_debugging_exit(c) ({})
+
+#define dbg_ntype(type) ""
+#define dbg_cstate(cmt_state) ""
+#define dbg_get_key_dump(c, key) ({})
+#define dbg_dump_inode(c, inode) ({})
+#define dbg_dump_node(c, node) ({})
+#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
+#define dbg_dump_budget_req(req) ({})
+#define dbg_dump_lstats(lst) ({})
+#define dbg_dump_budg(c) ({})
+#define dbg_dump_lprop(c, lp) ({})
+#define dbg_dump_lprops(c) ({})
+#define dbg_dump_lpt_info(c) ({})
+#define dbg_dump_leb(c, lnum) ({})
+#define dbg_dump_znode(c, znode) ({})
+#define dbg_dump_heap(c, heap, cat) ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c) ({})
+#define dbg_dump_index(c) ({})
+#define dbg_dump_lpt_lebs(c) ({})
+
+#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
+#define dbg_old_index_check_init(c, zroot) 0
+#define dbg_save_space_info(c) ({})
+#define dbg_check_space_info(c) 0
+#define dbg_check_old_index(c, zroot) 0
+#define dbg_check_cats(c) 0
+#define dbg_check_ltab(c) 0
+#define dbg_chk_lpt_free_spc(c) 0
+#define dbg_chk_lpt_sz(c, action, len) 0
+#define dbg_check_synced_i_size(inode) 0
+#define dbg_check_dir_size(c, dir) 0
+#define dbg_check_tnc(c, x) 0
+#define dbg_check_idx_size(c, idx_size) 0
+#define dbg_check_filesystem(c) 0
+#define dbg_check_heap(c, heap, cat, add_pos) ({})
+#define dbg_check_lprops(c) 0
+#define dbg_check_lpt_nodes(c, cnode, row, col) 0
+#define dbg_force_in_the_gaps_enabled 0
+#define dbg_force_in_the_gaps() 0
+#define dbg_failure_mode 0
+
+#define dbg_debugfs_init() 0
+#define dbg_debugfs_exit()
+#define dbg_debugfs_init_fs(c) 0
+#define dbg_debugfs_exit_fs(c) 0
+
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
+
+/*
+ * Some compatibility stuff goes here.
+ */
+
+#include <asm/div64.h>
+
+/*
+ * div_u64 - divide @dividend by @divisor with 'do_div()'.
+ *
+ * 'do_div()' stores the quotient back into @dividend, which is then returned.
+ * NOTE(review): 'do_div()' is conventionally a 64-by-32 bit division; confirm
+ * that callers never pass a @divisor which does not fit into 32 bits.
+ */
+static inline uint64_t div_u64(uint64_t dividend, uint64_t divisor)
+{
+	do_div(dividend, divisor);
+	return dividend;
+}
+
+/*
+ * Link the chain of entries hanging off @list (from @list->next up to
+ * @list->prev) in between @prev and @next. @list itself is not modified, and
+ * the caller must make sure it is not empty.
+ */
+static inline void ___list_splice(const struct list_head *list,
+				  struct list_head *prev,
+				  struct list_head *next)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+
+	first->prev = prev;
+	prev->next = first;
+
+	last->next = next;
+	next->prev = last;
+}
+
+/*
+ * The below has been back-ported from 2.6.28. It inserts the entries of @list
+ * right before @head, i.e. at the tail of the list headed by @head, and does
+ * nothing if @list is empty.
+ */
+static inline void list_splice_tail(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list))
+		___list_splice(list, head->prev, head);
+}
+#endif /* !__UBIFS_DEBUG_H__ */
diff -Nurd linux-2.6.24/fs/ubifs/dir.c ubifs-v2.6.24/fs/ubifs/dir.c
--- linux-2.6.24/fs/ubifs/dir.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/dir.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,1215 @@
+/* * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file implements directory operations.
+ *
+ * All FS operations in this file allocate budget before writing anything to the
+ * media.
If they fail to allocate it, the error is returned. The only
+ * exceptions are 'ubifs_unlink()' and 'ubifs_rmdir()' which keep working even
+ * if they are unable to allocate the budget, because deletion %-ENOSPC failure
+ * is not what users are usually ready to get. UBIFS budgeting subsystem has
+ * some space reserved for these purposes.
+ *
+ * All operations in this file write all inodes which they change straight
+ * away, instead of marking them dirty. For example, 'ubifs_link()' changes
+ * @i_size of the parent inode and writes the parent inode together with the
+ * target inode. This was done to simplify file-system recovery which would
+ * otherwise be very difficult to do. The only exception is rename which marks
+ * the re-named inode dirty (because its @i_ctime is updated) but does not
+ * write it, but just marks it as dirty.
+ */
+
+#include "ubifs.h"
+
+/**
+ * inherit_flags - inherit flags of the parent inode.
+ * @dir: parent inode
+ * @mode: new inode mode flags
+ *
+ * This is a helper function for 'ubifs_new_inode()' which inherits flags of
+ * the parent directory inode @dir. UBIFS inodes inherit the following flags:
+ * o %UBIFS_COMPR_FL, which is useful to switch compression on/off on
+ *   sub-directory basis;
+ * o %UBIFS_SYNC_FL - useful for the same reasons;
+ * o %UBIFS_DIRSYNC_FL - similar, but relevant only to directories.
+ *
+ * This function returns the inherited flags.
+ */
+static int inherit_flags(const struct inode *dir, int mode)
+{
+	const struct ubifs_inode *ui = ubifs_inode(dir);
+	int inherited;
+
+	/*
+	 * A parent which is not a directory means that an extended attribute
+	 * inode is being created, and those inherit nothing.
+	 */
+	if (!S_ISDIR(dir->i_mode))
+		return 0;
+
+	inherited = ui->flags &
+		    (UBIFS_COMPR_FL | UBIFS_SYNC_FL | UBIFS_DIRSYNC_FL);
+
+	/* The "DIRSYNC" flag only applies to directories */
+	return S_ISDIR(mode) ? inherited : inherited & ~UBIFS_DIRSYNC_FL;
+}
+
+/**
+ * ubifs_new_inode - allocate new UBIFS inode object.
+ * @c: UBIFS file-system description object
+ * @dir: parent directory inode
+ * @mode: inode mode flags
+ *
+ * This function finds an unused inode number, allocates new inode and
+ * initializes it. Returns new inode in case of success and an error code in
+ * case of failure.
+ */
+struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
+			      int mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+
+	inode = new_inode(c->vfs_sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	/* Derive the UBIFS inode only once @inode is known to be valid */
+	ui = ubifs_inode(inode);
+
+	/*
+	 * Set 'S_NOCMTIME' to prevent VFS from updating [mc]time of inodes and
+	 * marking them dirty in file write path (see 'file_update_time()').
+	 * UBIFS has to fully control "clean <-> dirty" transitions of inodes
+	 * to make budgeting work.
+	 */
+	inode->i_flags |= (S_NOCMTIME);
+
+	inode->i_uid = current->fsuid;
+	if (dir->i_mode & S_ISGID) {
+		/* Set-GID parent: take its group; directories stay set-GID */
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else
+		inode->i_gid = current->fsgid;
+	inode->i_mode = mode;
+	inode->i_mtime = inode->i_atime = inode->i_ctime =
+			 ubifs_current_time(inode);
+	inode->i_mapping->nrpages = 0;
+	/* Disable readahead */
+	inode->i_mapping->backing_dev_info = &c->bdi;
+
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		inode->i_mapping->a_ops = &ubifs_file_address_operations;
+		inode->i_op = &ubifs_file_inode_operations;
+		inode->i_fop = &ubifs_file_operations;
+		break;
+	case S_IFDIR:
+		inode->i_op = &ubifs_dir_inode_operations;
+		inode->i_fop = &ubifs_dir_operations;
+		inode->i_size = ui->ui_size = UBIFS_INO_NODE_SZ;
+		break;
+	case S_IFLNK:
+		inode->i_op = &ubifs_symlink_inode_operations;
+		break;
+	case S_IFSOCK:
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+		inode->i_op = &ubifs_file_inode_operations;
+		break;
+	default:
+		BUG();
+	}
+
+	ui->flags = inherit_flags(dir, mode);
+	ubifs_set_inode_flags(inode);
+	/* Only regular files get the default compressor */
+	if (S_ISREG(mode))
+		ui->compr_type = c->default_compr;
+	else
+		ui->compr_type = UBIFS_COMPR_NONE;
+	ui->synced_i_size = 0;
+
+	
spin_lock(&c->cnt_lock);
+	/* Inode number overflow is currently not supported */
+	if (c->highest_inum >= INUM_WARN_WATERMARK) {
+		if (c->highest_inum >= INUM_WATERMARK) {
+			spin_unlock(&c->cnt_lock);
+			ubifs_err("out of inode numbers");
+			make_bad_inode(inode);
+			iput(inode);
+			return ERR_PTR(-EINVAL);
+		}
+		ubifs_warn("running out of inode numbers (current %lu, max %d)",
+			   (unsigned long)c->highest_inum, INUM_WATERMARK);
+	}
+
+	inode->i_ino = ++c->highest_inum;
+	/*
+	 * The creation sequence number remains with this inode for its
+	 * lifetime. All nodes for this inode have a greater sequence number,
+	 * and so it is possible to distinguish obsolete nodes belonging to a
+	 * previous incarnation of the same inode number - for example, for the
+	 * purpose of rebuilding the index.
+	 */
+	ui->creat_sqnum = ++c->max_sqnum;
+	spin_unlock(&c->cnt_lock);
+	return inode;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/*
+ * Debugging check: make sure the name stored in directory entry node @dent
+ * has the same length and the same bytes as @nm. Returns zero if the names
+ * match or if general checks (%UBIFS_CHK_GEN) are disabled, and %-EINVAL
+ * otherwise.
+ */
+static int dbg_check_name(struct ubifs_dent_node *dent, struct qstr *nm)
+{
+	int differs;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	/* The bytes are compared only when the lengths are equal */
+	differs = le16_to_cpu(dent->nlen) != nm->len ||
+		  memcmp(dent->name, nm->name, nm->len) != 0;
+	return differs ? -EINVAL : 0;
+}
+
+#else
+
+#define dbg_check_name(dent, nm) 0
+
+#endif
+
+static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	int err;
+	union ubifs_key key;
+	struct inode *inode = NULL;
+	struct ubifs_dent_node *dent;
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+	dbg_gen("'%.*s' in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+	if (dentry->d_name.len > UBIFS_MAX_NLEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
+	if (!dent)
+		return ERR_PTR(-ENOMEM);
+
+	dent_key_init(c, &key, dir->i_ino, &dentry->d_name);
+
+	err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name);
+	if (err) {
+		/*
+		 * Do not hash the direntry if parent 'i_nlink' is zero, because
+		 * this has side-effects -
'->delete_inode()' call will not be
+		 * called for the parent orphan inode, because 'd_count' of its
+		 * direntry will stay 1 (it'll be negative direntry I guess)
+		 * and prevent 'iput_final()' until the dentry is destroyed due
+		 * to unmount or memory pressure.
+		 *
+		 * So a missing name is only routed to 'done' while the parent
+		 * still has links; otherwise %-ENOENT is returned as an error,
+		 * like any other lookup failure.
+		 */
+		if (err == -ENOENT && dir->i_nlink != 0) {
+			dbg_gen("not found");
+			goto done;
+		}
+		goto out;
+	}
+
+	if (dbg_check_name(dent, &dentry->d_name)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum));
+	if (IS_ERR(inode)) {
+		/*
+		 * This should not happen. Probably the file-system needs
+		 * checking.
+		 */
+		err = PTR_ERR(inode);
+		ubifs_err("dead directory entry '%.*s', error %d",
+			  dentry->d_name.len, dentry->d_name.name, err);
+		ubifs_ro_mode(c, err);
+		goto out;
+	}
+
+done:
+	/* @inode is still %NULL here if the name was not found */
+	kfree(dent);
+	return d_splice_alias(inode, dentry);
+
+out:
+	kfree(dent);
+	return ERR_PTR(err);
+}
+
+/*
+ * Create a new inode with mode @mode in directory @dir and attach it to
+ * @dentry. Returns zero in case of success and a negative error code in case
+ * of failure. @nd is not used.
+ */
+static int ubifs_create(struct inode *dir, struct dentry *dentry, int mode,
+			struct nameidata *nd)
+{
+	struct inode *inode;
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.dirtied_ino = 1 };
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+
+	/*
+	 * Budget request settings: new inode, new direntry, changing the
+	 * parent directory inode.
+	 */
+
+	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	ubifs_err("cannot create regular file, error %d", err);
+	return err;
+}
+
+/**
+ * vfs_dent_type - get VFS directory entry type.
+ * @type: UBIFS directory entry type
+ *
+ * This function maps the UBIFS directory entry type @type to the matching VFS
+ * 'DT_*' constant. An unknown @type hits 'BUG()'.
+ */
+static unsigned int vfs_dent_type(uint8_t type)
+{
+	unsigned int dt = 0;
+
+	switch (type) {
+	case UBIFS_ITYPE_REG:
+		dt = DT_REG;
+		break;
+	case UBIFS_ITYPE_DIR:
+		dt = DT_DIR;
+		break;
+	case UBIFS_ITYPE_LNK:
+		dt = DT_LNK;
+		break;
+	case UBIFS_ITYPE_BLK:
+		dt = DT_BLK;
+		break;
+	case UBIFS_ITYPE_CHR:
+		dt = DT_CHR;
+		break;
+	case UBIFS_ITYPE_FIFO:
+		dt = DT_FIFO;
+		break;
+	case UBIFS_ITYPE_SOCK:
+		dt = DT_SOCK;
+		break;
+	default:
+		BUG();
+	}
+	return dt;
+}
+
+/*
+ * The classical Unix view for directory is that it is a linear array of
+ * (name, inode number) entries. Linux/VFS assumes this model as well.
+ * Particularly, 'readdir()' call wants us to return a directory entry offset
+ * which later may be used to continue 'readdir()'ing the directory or to
+ * 'seek()' to that specific direntry.
Obviously UBIFS does not really fit this
+ * model because directory entries are identified by keys, which may collide.
+ *
+ * UBIFS uses directory entry hash value for directory offsets, so
+ * 'seekdir()'/'telldir()' may not always work because of possible key
+ * collisions. But UBIFS guarantees that consecutive 'readdir()' calls work
+ * properly by means of saving full directory entry name in the private field
+ * of the file description object.
+ *
+ * This means that UBIFS cannot support NFS which requires full
+ * 'seekdir()'/'telldir()' support.
+ *
+ * File position conventions used by this function: positions 0 and 1 stand
+ * for "." and "..", position 2 means that there are no more entries, and any
+ * other position is the key hash of the directory entry to return next. That
+ * entry is kept in @file->private_data between calls.
+ */
+static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
+{
+	int err, over = 0;
+	struct qstr nm;
+	union ubifs_key key;
+	struct ubifs_dent_node *dent;
+	struct inode *dir = file->f_path.dentry->d_inode;
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+
+	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+
+	if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+		/*
+		 * The directory was seek'ed to a senseless position or there
+		 * are no more entries.
+		 */
+		return 0;
+
+	/* File positions 0 and 1 correspond to "." and ".." */
+	if (file->f_pos == 0) {
+		ubifs_assert(!file->private_data);
+		over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
+		if (over)
+			return 0;
+		file->f_pos = 1;
+	}
+
+	if (file->f_pos == 1) {
+		ubifs_assert(!file->private_data);
+		over = filldir(dirent, "..", 2, 1,
+			       parent_ino(file->f_path.dentry), DT_DIR);
+		if (over)
+			return 0;
+
+		/* Find the first entry in TNC and save it */
+		lowest_dent_key(c, &key, dir->i_ino);
+		nm.name = NULL;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+
+		file->f_pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+	}
+
+	dent = file->private_data;
+	if (!dent) {
+		/*
+		 * The directory was seek'ed to and is now readdir'ed.
+		 * Find the entry corresponding to @file->f_pos or the
+		 * closest one.
+		 */
+		dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+		nm.name = NULL;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+		file->f_pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+	}
+
+	while (1) {
+		dbg_gen("feed '%s', ino %llu, new f_pos %#x",
+			dent->name, (unsigned long long)le64_to_cpu(dent->inum),
+			key_hash_flash(c, &dent->key));
+		ubifs_assert(le64_to_cpu(dent->ch.sqnum) >
+			     ubifs_inode(dir)->creat_sqnum);
+
+		nm.len = le16_to_cpu(dent->nlen);
+		over = filldir(dirent, dent->name, nm.len, file->f_pos,
+			       le64_to_cpu(dent->inum),
+			       vfs_dent_type(dent->type));
+		/*
+		 * If the entry was not accepted, stop here. It stays saved in
+		 * @file->private_data and @file->f_pos is left unchanged.
+		 */
+		if (over)
+			return 0;
+
+		/* Switch to the next entry */
+		key_read(c, &dent->key, &key);
+		nm.name = dent->name;
+		dent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(dent)) {
+			err = PTR_ERR(dent);
+			goto out;
+		}
+
+		/* The previous entry is freed only after the next one is found */
+		kfree(file->private_data);
+		file->f_pos = key_hash_flash(c, &dent->key);
+		file->private_data = dent;
+		cond_resched();
+	}
+
+out:
+	if (err != -ENOENT) {
+		ubifs_err("cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	/*
+	 * %-ENOENT means that there are no more entries: drop the saved entry
+	 * and switch to the "end of directory" position.
+	 */
+	kfree(file->private_data);
+	file->private_data = NULL;
+	file->f_pos = 2;
+	return 0;
+}
+
+/* If a directory is seeked, we have to free saved readdir() state */
+static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+	kfree(file->private_data);
+	file->private_data = NULL;
+	return generic_file_llseek(file, offset, origin);
+}
+
+/* Free saved readdir() state when the directory is closed */
+static int ubifs_dir_release(struct inode *dir, struct file *file)
+{
+	kfree(file->private_data);
+	file->private_data = NULL;
+	return 0;
+}
+
+/**
+ * lock_2_inodes - a wrapper for locking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */
+static void lock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	/* @inode1 is always locked first, each mutex with its own sub-class */
+	mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1);
+	mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2);
+}
+
+/**
+ * unlock_2_inodes - a wrapper for unlocking two UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ *
+ * The mutexes are released in the reverse order of 'lock_2_inodes()'.
+ */
+static void unlock_2_inodes(struct inode *inode1, struct inode *inode2)
+{
+	mutex_unlock(&ubifs_inode(inode2)->ui_mutex);
+	mutex_unlock(&ubifs_inode(inode1)->ui_mutex);
+}
+
+/*
+ * Create a new directory entry @dentry in directory @dir which refers to the
+ * inode of @old_dentry. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
+		      struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = old_dentry->d_inode;
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2,
+				.dirtied_ino_d = ALIGN(ui->data_len, 8) };
+
+	/*
+	 * Budget request settings: new direntry, changing the target inode,
+	 * changing the parent inode.
+	 */
+
+	dbg_gen("dent '%.*s' to ino %lu (nlink %d) in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	err = dbg_check_synced_i_size(inode);
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	lock_2_inodes(dir, inode);
+	inc_nlink(inode);
+	/*
+	 * Take an extra inode reference for the new dentry; it is dropped by
+	 * 'iput()' on the error path.
+	 */
+	atomic_inc(&inode->i_count);
+	inode->i_ctime = ubifs_current_time(inode);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	ubifs_release_budget(c, &req);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	/* Undo the size and link count changes made above */
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	drop_nlink(inode);
+	unlock_2_inodes(dir, inode);
+	ubifs_release_budget(c, &req);
+	iput(inode);
+	return err;
+}
+
+/*
+ * Remove directory entry @dentry from directory @dir and drop one link of the
+ * inode it refers to. Returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = dentry->d_inode;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, budgeted = 1;
+	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+	/*
+	 * Budget request settings: deletion direntry, deletion inode (+1 for
+	 * @dirtied_ino), changing the parent directory inode. If budgeting
+	 * fails, go ahead anyway because we have extra space reserved for
+	 * deletions.
+	 */
+
+	dbg_gen("dent '%.*s' from ino %lu (nlink %d) in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, inode->i_ino,
+		inode->i_nlink, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	err = dbg_check_synced_i_size(inode);
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		/* Only %-ENOSPC is tolerated; remember that nothing was budgeted */
+		if (err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
+
+	lock_2_inodes(dir, inode);
+	inode->i_ctime = ubifs_current_time(dir);
+	drop_nlink(inode);
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		/* We've deleted something - clean the "no space" flags */
+		c->nospace = c->nospace_rp = 0;
+		smp_wmb();
+	}
+	return 0;
+
+out_cancel:
+	/* Undo the size and link count changes made above */
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	inc_nlink(inode);
+	unlock_2_inodes(dir, inode);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * check_dir_empty - check if a directory is empty or not.
+ * @c: UBIFS file-system description object
+ * @dir: VFS inode object of the directory to check
+ *
+ * This function checks if directory @dir is empty. Returns zero if the
+ * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes
+ * in case of errors.
+ */
+static int check_dir_empty(struct ubifs_info *c, struct inode *dir)
+{
+	struct qstr no_name = { .name = NULL };
+	union ubifs_key first_key;
+	struct ubifs_dent_node *first;
+
+	/* Ask the TNC for the very first directory entry of @dir */
+	lowest_dent_key(c, &first_key, dir->i_ino);
+	first = ubifs_tnc_next_ent(c, &first_key, &no_name);
+	if (!IS_ERR(first)) {
+		/* There is at least one entry */
+		kfree(first);
+		return -ENOTEMPTY;
+	}
+
+	/* "No such entry" is exactly what an empty directory looks like */
+	if (PTR_ERR(first) == -ENOENT)
+		return 0;
+	return PTR_ERR(first);
+}
+
+static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	struct inode *inode = dentry->d_inode;
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, budgeted = 1;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+
+	/*
+	 * Budget request settings: deletion direntry, deletion inode and
+	 * changing the parent inode. If budgeting fails, go ahead anyway
+	 * because we have extra space reserved for deletions.
+	 */
+
+	dbg_gen("directory '%.*s', ino %lu in dir ino %lu", dentry->d_name.len,
+		dentry->d_name.name, inode->i_ino, dir->i_ino);
+	ubifs_assert(mutex_is_locked(&dir->i_mutex));
+	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	err = check_dir_empty(c, dentry->d_inode);
+	if (err)
+		return err;
+
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		if (err != -ENOSPC)
+			return err;
+		budgeted = 0;
+	}
+
+	lock_2_inodes(dir, inode);
+	inode->i_ctime = ubifs_current_time(dir);
+	clear_nlink(inode);
+	drop_nlink(dir);
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0);
+	if (err)
+		goto out_cancel;
+	unlock_2_inodes(dir, inode);
+
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	else {
+		/* We've deleted something - clean the "no space" flags */
+		c->nospace = c->nospace_rp = 0;
+		smp_wmb();
+	}
+	return 0;
+
+out_cancel:
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	inc_nlink(dir);
+	
inc_nlink(inode);
+	inc_nlink(inode);
+	unlock_2_inodes(dir, inode);
+	if (budgeted)
+		ubifs_release_budget(c, &req);
+	return err;
+}
+
+/*
+ * Create a new directory inode in directory @dir and attach it to @dentry.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+
+	dbg_gen("dent '%.*s', mode %#x in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, mode, dir->i_ino);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFDIR | mode);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	mutex_lock(&dir_ui->ui_mutex);
+	insert_inode_hash(inode);
+	inc_nlink(inode);
+	inc_nlink(dir);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err) {
+		ubifs_err("cannot create directory, error %d", err);
+		goto out_cancel;
+	}
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	drop_nlink(dir);
+	mutex_unlock(&dir_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/*
+ * Create a special inode (device node, FIFO or socket) in directory @dir and
+ * attach it to @dentry. For block and character devices the encoded device
+ * number @rdev is stored as the inode data. Returns zero in case of success
+ * and a negative error code in case of failure.
+ */
+static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
+		       int mode, dev_t rdev)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	union ubifs_dev_desc *dev = NULL;
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	int err, devlen = 0;
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.dirtied_ino = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory inode.
+	 */
+
+	dbg_gen("dent '%.*s' in dir ino %lu",
+		dentry->d_name.len, dentry->d_name.name, dir->i_ino);
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	if (S_ISBLK(mode) || S_ISCHR(mode)) {
+		dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
+		if (!dev)
+			return -ENOMEM;
+		devlen = ubifs_encode_dev(dev, rdev);
+	}
+
+	/*
+	 * The inode data length is known only now, so it has to be put to the
+	 * budget request here and not in the initializer, where @devlen is
+	 * still zero.
+	 */
+	req.new_ino_d = ALIGN(devlen, 8);
+	err = ubifs_budget_space(c, &req);
+	if (err) {
+		kfree(dev);
+		return err;
+	}
+
+	inode = ubifs_new_inode(c, dir, mode);
+	if (IS_ERR(inode)) {
+		kfree(dev);
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	init_special_inode(inode, inode->i_mode, rdev);
+	inode->i_size = ubifs_inode(inode)->ui_size = devlen;
+	ui = ubifs_inode(inode);
+	/* From now on @dev belongs to the inode and is not freed here */
+	ui->data = dev;
+	ui->data_len = devlen;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
+{
+	struct inode *inode;
+	struct ubifs_inode *ui;
+	struct ubifs_inode *dir_ui = ubifs_inode(dir);
+	struct ubifs_info *c = dir->i_sb->s_fs_info;
+	int err, len = strlen(symname);
+	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+					.new_ino_d = ALIGN(len, 8),
+					.dirtied_ino = 1 };
+
+	/*
+	 * Budget request settings: new inode, new direntry and changing parent
+	 * directory
inode.
+	 */
+
+	dbg_gen("dent '%.*s', target '%s' in dir ino %lu", dentry->d_name.len,
+		dentry->d_name.name, symname, dir->i_ino);
+
+	if (len > UBIFS_MAX_INO_DATA)
+		return -ENAMETOOLONG;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	ui = ubifs_inode(inode);
+	ui->data = kmalloc(len + 1, GFP_NOFS);
+	if (!ui->data) {
+		err = -ENOMEM;
+		goto out_inode;
+	}
+
+	/* Keep a zero-terminated in-memory copy of the link target */
+	memcpy(ui->data, symname, len);
+	((char *)ui->data)[len] = '\0';
+	/*
+	 * The terminating zero byte is not written to the flash media and it
+	 * is put just to make later in-memory string processing simpler. Thus,
+	 * data length is @len, not @len + %1.
+	 */
+	ui->data_len = len;
+	inode->i_size = ubifs_inode(inode)->ui_size = len;
+
+	mutex_lock(&dir_ui->ui_mutex);
+	dir->i_size += sz_change;
+	dir_ui->ui_size = dir->i_size;
+	dir->i_mtime = dir->i_ctime = inode->i_ctime;
+	err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&dir_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	d_instantiate(dentry, inode);
+	return 0;
+
+	/*
+	 * Error unwinding: 'out_cancel' undoes the parent directory size
+	 * change, 'out_inode' gets rid of the new inode and 'out_budg'
+	 * releases the budget. Each label falls through to the next one.
+	 */
+out_cancel:
+	dir->i_size -= sz_change;
+	dir_ui->ui_size = dir->i_size;
+	mutex_unlock(&dir_ui->ui_mutex);
+out_inode:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * lock_3_inodes - a wrapper for locking three UBIFS inodes.
+ * @inode1: first inode
+ * @inode2: second inode
+ * @inode3: third inode
+ *
+ * This function is used for 'ubifs_rename()' and @inode1 may be the same as
+ * @inode2 whereas @inode3 may be %NULL.
+ *
+ * We do not implement any tricks to guarantee strict lock ordering, because
+ * VFS has already done it for us on the @i_mutex. So this is just a simple
+ * wrapper function.
+ */ +static void lock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + mutex_lock_nested(&ubifs_inode(inode1)->ui_mutex, WB_MUTEX_1); + if (inode2 != inode1) + mutex_lock_nested(&ubifs_inode(inode2)->ui_mutex, WB_MUTEX_2); + if (inode3) + mutex_lock_nested(&ubifs_inode(inode3)->ui_mutex, WB_MUTEX_3); +} + +/** + * unlock_3_inodes - a wrapper for unlocking three UBIFS inodes for rename. + * @inode1: first inode + * @inode2: second inode + * @inode3: third inode + */ +static void unlock_3_inodes(struct inode *inode1, struct inode *inode2, + struct inode *inode3) +{ + if (inode3) + mutex_unlock(&ubifs_inode(inode3)->ui_mutex); + if (inode1 != inode2) + mutex_unlock(&ubifs_inode(inode2)->ui_mutex); + mutex_unlock(&ubifs_inode(inode1)->ui_mutex); +} + +static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct ubifs_info *c = old_dir->i_sb->s_fs_info; + struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = new_dentry->d_inode; + struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); + int err, release, sync = 0, move = (new_dir != old_dir); + int is_dir = S_ISDIR(old_inode->i_mode); + int unlink = !!new_inode; + int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); + int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, + .dirtied_ino = 3 }; + struct ubifs_budget_req ino_req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; + struct timespec time; + + /* + * Budget request settings: deletion direntry, new direntry, removing + * the old inode, and changing old and new parent directory inodes. + * + * However, this operation also marks the target inode as dirty and + * does not write it, so we allocate budget for the target inode + * separately. 
+ */ + + dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in " + "dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name, + old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len, + new_dentry->d_name.name, new_dir->i_ino); + ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); + ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + if (unlink) + ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + + + if (unlink && is_dir) { + err = check_dir_empty(c, new_inode); + if (err) + return err; + } + + err = ubifs_budget_space(c, &req); + if (err) + return err; + err = ubifs_budget_space(c, &ino_req); + if (err) { + ubifs_release_budget(c, &req); + return err; + } + + lock_3_inodes(old_dir, new_dir, new_inode); + + /* + * Like most other Unix systems, set the @i_ctime for inodes on a + * rename. + */ + time = ubifs_current_time(old_dir); + old_inode->i_ctime = time; + + /* We must adjust parent link count when renaming directories */ + if (is_dir) { + if (move) { + /* + * @old_dir loses a link because we are moving + * @old_inode to a different directory. + */ + drop_nlink(old_dir); + /* + * @new_dir only gains a link if we are not also + * overwriting an existing directory. + */ + if (!unlink) + inc_nlink(new_dir); + } else { + /* + * @old_inode is not moving to a different directory, + * but @old_dir still loses a link if we are + * overwriting an existing directory. + */ + if (unlink) + drop_nlink(old_dir); + } + } + + old_dir->i_size -= old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + old_dir->i_mtime = old_dir->i_ctime = time; + new_dir->i_mtime = new_dir->i_ctime = time; + + /* + * And finally, if we unlinked a direntry which happened to have the + * same name as the moved direntry, we have to decrement @i_nlink of + * the unlinked inode and change its ctime. + */ + if (unlink) { + /* + * Directories cannot have hard-links, so if this is a + * directory, decrement its @i_nlink twice because an empty + * directory has @i_nlink 2. 
+ */ + if (is_dir) + drop_nlink(new_inode); + new_inode->i_ctime = time; + drop_nlink(new_inode); + } else { + new_dir->i_size += new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + + /* + * Do not ask 'ubifs_jnl_rename()' to flush write-buffer if @old_inode + * is dirty, because this will be done later on at the end of + * 'ubifs_rename()'. + */ + if (IS_SYNC(old_inode)) { + sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); + if (unlink && IS_SYNC(new_inode)) + sync = 1; + } + err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, + sync); + if (err) + goto out_cancel; + + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &req); + + mutex_lock(&old_inode_ui->ui_mutex); + release = old_inode_ui->dirty; + mark_inode_dirty_sync(old_inode); + mutex_unlock(&old_inode_ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &ino_req); + if (IS_SYNC(old_inode)) + err = old_inode->i_sb->s_op->write_inode(old_inode, 1); + return err; + +out_cancel: + if (unlink) { + if (is_dir) + inc_nlink(new_inode); + inc_nlink(new_inode); + } else { + new_dir->i_size -= new_sz; + ubifs_inode(new_dir)->ui_size = new_dir->i_size; + } + old_dir->i_size += old_sz; + ubifs_inode(old_dir)->ui_size = old_dir->i_size; + if (is_dir) { + if (move) { + inc_nlink(old_dir); + if (!unlink) + drop_nlink(new_dir); + } else { + if (unlink) + inc_nlink(old_dir); + } + } + unlock_3_inodes(old_dir, new_dir, new_inode); + ubifs_release_budget(c, &ino_req); + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + loff_t size; + struct inode *inode = dentry->d_inode; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + stat->dev = inode->i_sb->s_dev; + stat->ino = inode->i_ino; + stat->mode = inode->i_mode; + stat->nlink = inode->i_nlink; + stat->uid = inode->i_uid; + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->atime = 
inode->i_atime; + stat->mtime = inode->i_mtime; + stat->ctime = inode->i_ctime; + stat->blksize = UBIFS_BLOCK_SIZE; + stat->size = ui->ui_size; + + /* + * Unfortunately, the 'stat()' system call was designed for block + * device based file systems, and it is not appropriate for UBIFS, + * because UBIFS does not have notion of "block". For example, it is + * difficult to tell how many block a directory takes - it actually + * takes less than 300 bytes, but we have to round it to block size, + * which introduces large mistake. This makes utilities like 'du' to + * report completely senseless numbers. This is the reason why UBIFS + * goes the same way as JFFS2 - it reports zero blocks for everything + * but regular files, which makes more sense than reporting completely + * wrong sizes. + */ + if (S_ISREG(inode->i_mode)) { + size = ui->xattr_size; + size += stat->size; + size = ALIGN(size, UBIFS_BLOCK_SIZE); + /* + * Note, user-space expects 512-byte blocks count irrespectively + * of what was reported in @stat->size. 
+ */ + stat->blocks = size >> 9; + } else + stat->blocks = 0; + mutex_unlock(&ui->ui_mutex); + return 0; +} + +const struct inode_operations ubifs_dir_inode_operations = { + .lookup = ubifs_lookup, + .create = ubifs_create, + .link = ubifs_link, + .symlink = ubifs_symlink, + .unlink = ubifs_unlink, + .mkdir = ubifs_mkdir, + .rmdir = ubifs_rmdir, + .mknod = ubifs_mknod, + .rename = ubifs_rename, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, + .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct file_operations ubifs_dir_operations = { + .llseek = ubifs_dir_llseek, + .release = ubifs_dir_release, + .read = generic_read_dir, + .readdir = ubifs_readdir, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff -Nurd linux-2.6.24/fs/ubifs/file.c ubifs-v2.6.24/fs/ubifs/file.c --- linux-2.6.24/fs/ubifs/file.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/file.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,1593 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements VFS file and inode operations of regular files, device + * nodes and symlinks as well as address space operations. + * + * UBIFS uses 2 page flags: PG_private and PG_checked. PG_private is set if the + * page is dirty and is used for budgeting purposes - dirty pages should not be + * budgeted. The PG_checked flag is set if full budgeting is required for the + * page e.g., when it corresponds to a file hole or it is just beyond the file + * size. The budgeting is done in 'ubifs_write_begin()', because it is OK to + * fail in this function, and the budget is released in 'ubifs_write_end()'. So + * the PG_private and PG_checked flags carry the information about how the page + * was budgeted, to make it possible to release the budget properly. + * + * A thing to keep in mind: inode's 'i_mutex' is locked in most VFS operations + * we implement. However, this is not true for '->writepage()', which might be + * called with 'i_mutex' unlocked. For example, when pdflush is performing + * write-back, it calls 'writepage()' with unlocked 'i_mutex', although the + * inode has 'I_LOCK' flag in this case. At "normal" work-paths 'i_mutex' is + * locked in '->writepage', e.g. in "sys_write -> alloc_pages -> direct reclaim + * path'. So, in '->writepage()' we are only guaranteed that the page is + * locked. + * + * Similarly, 'i_mutex' does not have to be locked in readpage(), e.g., + * readahead path does not have it locked ("sys_read -> generic_file_aio_read + * -> ondemand_readahead -> readpage"). In case of readahead, 'I_LOCK' flag is + * not set as well. However, UBIFS disables readahead. 
+ * + * This, for example means that there might be 2 concurrent '->writepage()' + * calls for the same inode, but different inode dirty pages. + */ + +#include "ubifs.h" +#include + +static int read_block(struct inode *inode, void *addr, unsigned int block, + struct ubifs_data_node *dn) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err, len, out_len; + union ubifs_key key; + unsigned int dlen; + + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_tnc_lookup(c, &key, dn); + if (err) { + if (err == -ENOENT) + /* Not found, so it must be a hole */ + memset(addr, 0, UBIFS_BLOCK_SIZE); + return err; + } + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto dump; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto dump; + + /* + * Data length can be less than a full block, even for blocks that are + * not the last in the file (e.g., as a result of making a hole and + * appending data). Ensure that the remainder is zeroed out. 
+ */ + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + return 0; + +dump: + ubifs_err("bad data node (block %u, inode %lu)", + block, inode->i_ino); + dbg_dump_node(c, dn); + return -EINVAL; +} + +static int do_readpage(struct page *page) +{ + void *addr; + int err = 0, i; + unsigned int block, beyond; + struct ubifs_data_node *dn; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + ubifs_assert(!PageChecked(page)); + ubifs_assert(!PagePrivate(page)); + + addr = kmap(page); + + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; + if (block >= beyond) { + /* Reading beyond inode */ + SetPageChecked(page); + memset(addr, 0, PAGE_CACHE_SIZE); + goto out; + } + + dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS); + if (!dn) { + err = -ENOMEM; + goto error; + } + + i = 0; + while (1) { + int ret; + + if (block >= beyond) { + /* Reading beyond inode */ + err = -ENOENT; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else { + ret = read_block(inode, addr, block, dn); + if (ret) { + err = ret; + if (err != -ENOENT) + break; + } else if (block + 1 == beyond) { + int dlen = le32_to_cpu(dn->size); + int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); + + if (ilen && ilen < dlen) + memset(addr + ilen, 0, dlen - ilen); + } + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += UBIFS_BLOCK_SIZE; + } + if (err) { + if (err == -ENOENT) { + /* Not found, so it must be a hole */ + SetPageChecked(page); + dbg_gen("hole"); + goto out_free; + } + ubifs_err("cannot read page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + goto error; + } + +out_free: + kfree(dn); +out: + SetPageUptodate(page); + ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + return 0; + +error: + kfree(dn); + ClearPageUptodate(page); + 
SetPageError(page); + flush_dcache_page(page); + kunmap(page); + return err; +} + +/** + * release_new_page_budget - release budget of a new page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of one new page of data. + */ +static void release_new_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .recalculate = 1, .new_page = 1 }; + + ubifs_release_budget(c, &req); +} + +/** + * release_existing_page_budget - release budget of an existing page. + * @c: UBIFS file-system description object + * + * This is a helper function which releases budget corresponding to the budget + * of changing one one page of data which already exists on the flash media. + */ +static void release_existing_page_budget(struct ubifs_info *c) +{ + struct ubifs_budget_req req = { .dd_growth = c->page_budget}; + + ubifs_release_budget(c, &req); +} + +static int write_begin_slow(struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct ubifs_budget_req req = { .new_page = 1 }; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + struct page *page; + + dbg_gen("ino %lu, pos %llu, len %u, i_size %lld", + inode->i_ino, pos, len, inode->i_size); + + /* + * At the slow path we have to budget before locking the page, because + * budgeting may force write-back, which would wait on locked pages and + * deadlock if we had the page locked. At this point we do not know + * anything about the page, so assume that this is a new page which is + * written to a hole. This corresponds to largest budget. Later the + * budget will be amended if this is not true. 
+ */ + if (appending) + /* We are appending data, budget for inode change */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) + return err; + + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) { + ubifs_release_budget(c, &req); + return -ENOMEM; + } + + if (!PageUptodate(page)) { + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) + SetPageChecked(page); + else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + if (PagePrivate(page)) + /* + * The page is dirty, which means it was budgeted twice: + * o first time the budget was allocated by the task which + * made the page dirty and set the PG_private flag; + * o and then we budgeted for it for the second time at the + * very beginning of this function. + * + * So what we have to do is to release the page budget we + * allocated. + */ + release_new_page_budget(c); + else if (!PageChecked(page)) + /* + * We are changing a page which already exists on the media. + * This means that changing the page does not make the amount + * of indexing information larger, and this part of the budget + * which we have already acquired may be released. + */ + ubifs_convert_page_budget(c); + + if (appending) { + struct ubifs_inode *ui = ubifs_inode(inode); + + /* + * 'ubifs_write_end()' is optimized from the fast-path part of + * 'ubifs_write_begin()' and expects the @ui_mutex to be locked + * if data is appended. + */ + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The inode is dirty already, so we may free the + * budget we allocated. + */ + ubifs_release_dirty_inode_budget(c, ui); + } + + *pagep = page; + return 0; +} + +/** + * allocate_budget - allocate budget for 'ubifs_write_begin()'. 
+ * @c: UBIFS file-system description object + * @page: page to allocate budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for 'ubifs_write_begin()' which allocates budget + * for the operation. The budget is allocated differently depending on whether + * this is appending, whether the page is dirty or not, and so on. This + * function leaves the @ui->ui_mutex locked in case of appending. Returns zero + * in case of success and %-ENOSPC in case of failure. + */ +static int allocate_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + struct ubifs_budget_req req = { .fast = 1 }; + + if (PagePrivate(page)) { + if (!appending) + /* + * The page is dirty and we are not appending, which + * means no budget is needed at all. + */ + return 0; + + mutex_lock(&ui->ui_mutex); + if (ui->dirty) + /* + * The page is dirty and we are appending, so the inode + * has to be marked as dirty. However, it is already + * dirty, so we do not need any budget. We may return, + * but @ui->ui_mutex hast to be left locked because we + * should prevent write-back from flushing the inode + * and freeing the budget. The lock will be released in + * 'ubifs_write_end()'. + */ + return 0; + + /* + * The page is dirty, we are appending, the inode is clean, so + * we need to budget the inode change. + */ + req.dirtied_ino = 1; + } else { + if (PageChecked(page)) + /* + * The page corresponds to a hole and does not + * exist on the media. So changing it makes + * make the amount of indexing information + * larger, and we have to budget for a new + * page. + */ + req.new_page = 1; + else + /* + * Not a hole, the change will not add any new + * indexing information, budget for page + * change. 
+ */ + req.dirtied_page = 1; + + if (appending) { + mutex_lock(&ui->ui_mutex); + if (!ui->dirty) + /* + * The inode is clean but we will have to mark + * it as dirty because we are appending. This + * needs a budget. + */ + req.dirtied_ino = 1; + } + } + + return ubifs_budget_space(c, &req); +} + +/* + * This function is called when a page of data is going to be written. Since + * the page of data will not necessarily go to the flash straight away, UBIFS + * has to reserve space on the media for it, which is done by means of + * budgeting. + * + * This is the hot-path of the file-system and we are trying to optimize it as + * much as possible. For this reasons it is split on 2 parts - slow and fast. + * + * There many budgeting cases: + * o a new page is appended - we have to budget for a new page and for + * changing the inode; however, if the inode is already dirty, there is + * no need to budget for it; + * o an existing clean page is changed - we have budget for it; if the page + * does not exist on the media (a hole), we have to budget for a new + * page; otherwise, we may budget for changing an existing page; the + * difference between these cases is that changing an existing page does + * not introduce anything new to the FS indexing information, so it does + * not grow, and smaller budget is acquired in this case; + * o an existing dirty page is changed - no need to budget at all, because + * the page budget has been acquired by earlier, when the page has been + * marked dirty. + * + * UBIFS budgeting sub-system may force write-back if it thinks there is no + * space to reserve. This imposes some locking restrictions and makes it + * impossible to take into account the above cases, and makes it impossible to + * optimize budgeting. + * + * The solution for this is that the fast path of 'ubifs_write_begin()' assumes + * there is a plenty of flash space and the budget will be acquired quickly, + * without forcing write-back. 
The slow path does not make this assumption. + */ +static int ubifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + int uninitialized_var(err), appending = !!(pos + len > inode->i_size); + int skipped_read = 0; + struct page *page; + + ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* Try out the fast-path part first */ + page = __grab_cache_page(mapping, index); + if (unlikely(!page)) + return -ENOMEM; + + if (!PageUptodate(page)) { + /* The page is not loaded from the flash */ + if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) { + /* + * We change whole page so no need to load it. But we + * have to set the @PG_checked flag to make the further + * code the page is new. This might be not true, but it + * is better to budget more that to read the page from + * the media. + */ + SetPageChecked(page); + skipped_read = 1; + } else { + err = do_readpage(page); + if (err) { + unlock_page(page); + page_cache_release(page); + return err; + } + } + + SetPageUptodate(page); + ClearPageError(page); + } + + err = allocate_budget(c, page, ui, appending); + if (unlikely(err)) { + ubifs_assert(err == -ENOSPC); + /* + * If we skipped reading the page because we were going to + * write all of it, then it is not up to date. + */ + if (skipped_read) { + ClearPageChecked(page); + ClearPageUptodate(page); + } + /* + * Budgeting failed which means it would have to force + * write-back but didn't, because we set the @fast flag in the + * request. Write-back cannot be done now, while we have the + * page locked, because it would deadlock. Unlock and free + * everything and fall-back to slow-path. 
+ */ + if (appending) { + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + unlock_page(page); + page_cache_release(page); + + return write_begin_slow(mapping, pos, len, pagep); + } + + /* + * Whee, we aquired budgeting quickly - without involving + * garbage-collection, committing or forceing write-back. We return + * with @ui->ui_mutex locked if we are appending pages, and unlocked + * otherwise. This is an optimization (slightly hacky though). + */ + *pagep = page; + return 0; + +} + +/** + * cancel_budget - cancel budget. + * @c: UBIFS file-system description object + * @page: page to cancel budget for + * @ui: UBIFS inode object the page belongs to + * @appending: non-zero if the page is appended + * + * This is a helper function for a page write operation. It unlocks the + * @ui->ui_mutex in case of appending. + */ +static void cancel_budget(struct ubifs_info *c, struct page *page, + struct ubifs_inode *ui, int appending) +{ + if (appending) { + if (!ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + mutex_unlock(&ui->ui_mutex); + } + if (!PagePrivate(page)) { + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + } +} + +static int ubifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + loff_t end_pos = pos + len; + int appending = !!(end_pos > inode->i_size); + + dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld", + inode->i_ino, pos, page->index, len, copied, inode->i_size); + + if (unlikely(copied < len && len == PAGE_CACHE_SIZE)) { + /* + * VFS copied less data to the page that it intended and + * declared in its '->write_begin()' call via the @len + * argument. 
If the page was not up-to-date, and @len was + * @PAGE_CACHE_SIZE, the 'ubifs_write_begin()' function did + * not load it from the media (for optimization reasons). This + * means that part of the page contains garbage. So read the + * page now. + */ + dbg_gen("copied %d instead of %d, read page and repeat", + copied, len); + cancel_budget(c, page, ui, appending); + + /* + * Return 0 to force VFS to repeat the whole operation, or the + * error code if 'do_readpage()' failes. + */ + copied = do_readpage(page); + goto out; + } + + if (!PagePrivate(page)) { + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (appending) { + i_size_write(inode, end_pos); + ui->ui_size = end_pos; + /* + * Note, we do not set @I_DIRTY_PAGES (which means that the + * inode has dirty pages), this has been done in + * '__set_page_dirty_nobuffers()'. + */ + __mark_inode_dirty(inode, I_DIRTY_DATASYNC); + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + mutex_unlock(&ui->ui_mutex); + } + +out: + unlock_page(page); + page_cache_release(page); + return copied; +} + +/** + * populate_page - copy data nodes into a page for bulk-read. + * @c: UBIFS file-system description object + * @page: page + * @bu: bulk-read information + * @n: next zbranch slot + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int populate_page(struct ubifs_info *c, struct page *page, + struct bu_info *bu, int *n) +{ + int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0; + struct inode *inode = page->mapping->host; + loff_t i_size = i_size_read(inode); + unsigned int page_block; + void *addr, *zaddr; + pgoff_t end_index; + + dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", + inode->i_ino, page->index, i_size, page->flags); + + addr = zaddr = kmap(page); + + end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; + if (!i_size || page->index > end_index) { + hole = 1; + memset(addr, 0, PAGE_CACHE_SIZE); + goto out_hole; + } + + page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + while (1) { + int err, len, out_len, dlen; + + if (nn >= bu->cnt) { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { + struct ubifs_data_node *dn; + + dn = bu->buf + (bu->zbranch[nn].offs - offs); + + ubifs_assert(le64_to_cpu(dn->ch.sqnum) > + ubifs_inode(inode)->creat_sqnum); + + len = le32_to_cpu(dn->size); + if (len <= 0 || len > UBIFS_BLOCK_SIZE) + goto out_err; + + dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + out_len = UBIFS_BLOCK_SIZE; + err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + le16_to_cpu(dn->compr_type)); + if (err || len != out_len) + goto out_err; + + if (len < UBIFS_BLOCK_SIZE) + memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + + nn += 1; + read = (i << UBIFS_BLOCK_SHIFT) + len; + } else if (key_block(c, &bu->zbranch[nn].key) < page_block) { + nn += 1; + continue; + } else { + hole = 1; + memset(addr, 0, UBIFS_BLOCK_SIZE); + } + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + addr += UBIFS_BLOCK_SIZE; + page_block += 1; + } + + if (end_index == page->index) { + int len = i_size & (PAGE_CACHE_SIZE - 1); + + if (len && len < read) + memset(zaddr + len, 0, read - len); + } + +out_hole: + if (hole) { + SetPageChecked(page); + dbg_gen("hole"); + } + + SetPageUptodate(page); + 
ClearPageError(page); + flush_dcache_page(page); + kunmap(page); + *n = nn; + return 0; + +out_err: + ClearPageUptodate(page); + SetPageError(page); + flush_dcache_page(page); + kunmap(page); + ubifs_err("bad data node (block %u, inode %lu)", + page_block, inode->i_ino); + return -EINVAL; +} + +/** + * ubifs_do_bulk_read - do bulk-read. + * @c: UBIFS file-system description object + * @bu: bulk-read information + * @page1: first page to read + * + * This function returns %1 if the bulk-read is done, otherwise %0 is returned. + */ +static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu, + struct page *page1) +{ + pgoff_t offset = page1->index, end_index; + struct address_space *mapping = page1->mapping; + struct inode *inode = mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + int err, page_idx, page_cnt, ret = 0, n = 0; + int allocate = bu->buf ? 0 : 1; + loff_t isize; + + err = ubifs_tnc_get_bu_keys(c, bu); + if (err) + goto out_warn; + + if (bu->eof) { + /* Turn off bulk-read at the end of the file */ + ui->read_in_a_row = 1; + ui->bulk_read = 0; + } + + page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT; + if (!page_cnt) { + /* + * This happens when there are multiple blocks per page and the + * blocks for the first page we are looking for, are not + * together. If all the pages were like this, bulk-read would + * reduce performance, so we turn it off for a while. + */ + goto out_bu_off; + } + + if (bu->cnt) { + if (allocate) { + /* + * Allocate bulk-read buffer depending on how many data + * nodes we are going to read. 
+ */ + bu->buf_len = bu->zbranch[bu->cnt - 1].offs + + bu->zbranch[bu->cnt - 1].len - + bu->zbranch[0].offs; + ubifs_assert(bu->buf_len > 0); + ubifs_assert(bu->buf_len <= c->leb_size); + bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN); + if (!bu->buf) + goto out_bu_off; + } + + err = ubifs_tnc_bulk_read(c, bu); + if (err) + goto out_warn; + } + + err = populate_page(c, page1, bu, &n); + if (err) + goto out_warn; + + unlock_page(page1); + ret = 1; + + isize = i_size_read(inode); + if (isize == 0) + goto out_free; + end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); + + for (page_idx = 1; page_idx < page_cnt; page_idx++) { + pgoff_t page_offset = offset + page_idx; + struct page *page; + + if (page_offset > end_index) + break; + page = find_or_create_page(mapping, page_offset, + GFP_NOFS | __GFP_COLD); + if (!page) + break; + if (!PageUptodate(page)) + err = populate_page(c, page, bu, &n); + unlock_page(page); + page_cache_release(page); + if (err) + break; + } + + ui->last_page_read = offset + page_idx - 1; + +out_free: + if (allocate) + kfree(bu->buf); + return ret; + +out_warn: + ubifs_warn("ignoring error %d and skipping bulk-read", err); + goto out_free; + +out_bu_off: + ui->read_in_a_row = ui->bulk_read = 0; + goto out_free; +} + +/** + * ubifs_bulk_read - determine whether to bulk-read and, if so, do it. + * @page: page from which to start bulk-read. + * + * Some flash media are capable of reading sequentially at faster rates. UBIFS + * bulk-read facility is designed to take advantage of that, by reading in one + * go consecutive data nodes that are also located consecutively in the same + * LEB. This function returns %1 if a bulk-read is done and %0 otherwise. 
+ */ +static int ubifs_bulk_read(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + pgoff_t index = page->index, last_page_read = ui->last_page_read; + struct bu_info *bu; + int err = 0, allocated = 0; + + ui->last_page_read = index; + if (!c->bulk_read) + return 0; + + /* + * Bulk-read is protected by @ui->ui_mutex, but it is an optimization, + * so don't bother if we cannot lock the mutex. + */ + if (!mutex_trylock(&ui->ui_mutex)) + return 0; + + if (index != last_page_read + 1) { + /* Turn off bulk-read if we stop reading sequentially */ + ui->read_in_a_row = 1; + if (ui->bulk_read) + ui->bulk_read = 0; + goto out_unlock; + } + + if (!ui->bulk_read) { + ui->read_in_a_row += 1; + if (ui->read_in_a_row < 3) + goto out_unlock; + /* Three reads in a row, so switch on bulk-read */ + ui->bulk_read = 1; + } + + /* + * If possible, try to use pre-allocated bulk-read information, which + * is protected by @c->bu_mutex. 
+ */ + if (mutex_trylock(&c->bu_mutex)) + bu = &c->bu; + else { + bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN); + if (!bu) + goto out_unlock; + + bu->buf = NULL; + allocated = 1; + } + + bu->buf_len = c->max_bu_buf_len; + data_key_init(c, &bu->key, inode->i_ino, + page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT); + err = ubifs_do_bulk_read(c, bu, page); + + if (!allocated) + mutex_unlock(&c->bu_mutex); + else + kfree(bu); + +out_unlock: + mutex_unlock(&ui->ui_mutex); + return err; +} + +static int ubifs_readpage(struct file *file, struct page *page) +{ + if (ubifs_bulk_read(page)) + return 0; + do_readpage(page); + unlock_page(page); + return 0; +} + +static int do_writepage(struct page *page, int len) +{ + int err = 0, i, blen; + unsigned int block; + void *addr; + union ubifs_key key; + struct inode *inode = page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + +#ifdef UBIFS_DEBUG + spin_lock(&ui->ui_lock); + ubifs_assert(page->index <= ui->synced_i_size << PAGE_CACHE_SIZE); + spin_unlock(&ui->ui_lock); +#endif + + /* Update radix tree tags */ + set_page_writeback(page); + + addr = kmap(page); + block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; + i = 0; + while (len) { + blen = min_t(int, len, UBIFS_BLOCK_SIZE); + data_key_init(c, &key, inode->i_ino, block); + err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + if (err) + break; + if (++i >= UBIFS_BLOCKS_PER_PAGE) + break; + block += 1; + addr += blen; + len -= blen; + } + if (err) { + SetPageError(page); + ubifs_err("cannot write page %lu of inode %lu, error %d", + page->index, inode->i_ino, err); + ubifs_ro_mode(c, err); + } + + ubifs_assert(PagePrivate(page)); + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); + + kunmap(page); + unlock_page(page); + end_page_writeback(page); + return err; +} + +/* + * When writing-back dirty inodes, VFS 
first writes-back pages belonging to the + * inode, then the inode itself. For UBIFS this may cause a problem. Consider a + * situation when we have an inode with size 0, then a megabyte of data is + * appended to the inode, then write-back starts and flushes some amount of the + * dirty pages, the journal becomes full, commit happens and finishes, and then + * an unclean reboot happens. When the file system is mounted next time, the + * inode size would still be 0, but there would be many pages which are beyond + * the inode size, they would be indexed and consume flash space. Because the + * journal has been committed, the replay would not be able to detect this + * situation and correct the inode size. This means UBIFS would have to scan + * the whole index and correct all inode sizes, which is long and unacceptable. + * + * To prevent situations like this, UBIFS writes pages back only if they are + * within the last synchronized inode size, i.e. the size which has been + * written to the flash media last time. Otherwise, UBIFS forces inode + * write-back, thus making sure the on-flash inode contains current inode size, + * and then keeps writing pages back. + * + * Some locking issues explanation. 'ubifs_writepage()' first is called with + * the page locked, and it locks @ui_mutex. However, write-back does not take inode + * @i_mutex, which means other VFS operations may be run on this inode at the + * same time. And the problematic one is truncation to smaller size, from where + * we have to call 'vmtruncate()', which first changes @inode->i_size, then + * drops the truncated pages. And while dropping the pages, it takes the page + * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with + * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This + * means that @inode->i_size is changed while @ui_mutex is unlocked. + * + * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond + * inode size.
How do we do this if @inode->i_size may become smaller while we + * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the + * @ui->ui_size "shadow" field which UBIFS uses instead of @inode->i_size + * internally and updates it under @ui_mutex. + * + * Q: why do we not worry that if we race with truncation, we may end up with a + * situation when the inode is truncated while we are in the middle of + * 'do_writepage()', so we do write beyond inode size? + * A: If we are in the middle of 'do_writepage()', truncation would be blocked + * on the page lock and it would not write the truncated inode node to the + * journal before we have finished. + */ +static int ubifs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct inode *inode = page->mapping->host; + struct ubifs_inode *ui = ubifs_inode(inode); + loff_t i_size = i_size_read(inode), synced_i_size; + pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; + int err, len = i_size & (PAGE_CACHE_SIZE - 1); + void *kaddr; + + dbg_gen("ino %lu, pg %lu, pg flags %#lx", + inode->i_ino, page->index, page->flags); + ubifs_assert(PagePrivate(page)); + + /* Is the page fully outside @i_size? (truncate in progress) */ + if (page->index > end_index || (page->index == end_index && !len)) { + err = 0; + goto out_unlock; + } + + spin_lock(&ui->ui_lock); + synced_i_size = ui->synced_i_size; + spin_unlock(&ui->ui_lock); + + /* Is the page fully inside @i_size? */ + if (page->index < end_index) { + if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + /* + * The inode has been written, but the write-buffer has + * not been synchronized, so in case of an unclean + * reboot we may end up with some pages beyond inode + * size, but they would be in the journal (because + * commit flushes write buffers) and recovery would deal + * with this.
+ */ + } + return do_writepage(page, PAGE_CACHE_SIZE); + } + + /* + * The page straddles @i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the page size. For a file that is not a multiple of + * the page size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." + */ + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + len, 0, PAGE_CACHE_SIZE - len); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (i_size > synced_i_size) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + goto out_unlock; + } + + return do_writepage(page, len); + +out_unlock: + unlock_page(page); + return err; +} + +/** + * do_attr_changes - change inode attributes. + * @inode: inode to change attributes for + * @attr: describes attributes to change + */ +static void do_attr_changes(struct inode *inode, const struct iattr *attr) +{ + if (attr->ia_valid & ATTR_UID) + inode->i_uid = attr->ia_uid; + if (attr->ia_valid & ATTR_GID) + inode->i_gid = attr->ia_gid; + if (attr->ia_valid & ATTR_ATIME) + inode->i_atime = timespec_trunc(attr->ia_atime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = timespec_trunc(attr->ia_mtime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = timespec_trunc(attr->ia_ctime, + inode->i_sb->s_time_gran); + if (attr->ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + inode->i_mode = mode; + } +} + +/** + * do_truncation - truncate an inode. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call when the inode is truncated + * to a smaller size. Returns zero in case of success and a negative error code + * in case of failure. 
+ */ +static int do_truncation(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err; + struct ubifs_budget_req req; + loff_t old_size = inode->i_size, new_size = attr->ia_size; + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); + memset(&req, 0, sizeof(struct ubifs_budget_req)); + + /* + * If this is truncation to a smaller size, and we do not truncate on a + * block boundary, budget for changing one data block, because the last + * block will be re-written. + */ + if (new_size & (UBIFS_BLOCK_SIZE - 1)) + req.dirtied_page = 1; + + req.dirtied_ino = 1; + /* A funny way to budget for truncation node */ + req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; + err = ubifs_budget_space(c, &req); + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } + + err = vmtruncate(inode, new_size); + if (err) + goto out_budg; + + if (offset) { + pgoff_t index = new_size >> PAGE_CACHE_SHIFT; + struct page *page; + + page = find_lock_page(inode->i_mapping, index); + if (page) { + if (PageDirty(page)) { + /* + * 'ubifs_jnl_truncate()' will try to truncate + * the last data node, but it contains + * out-of-date data because the page is dirty. + * Write the page now, so that + * 'ubifs_jnl_truncate()' will see an already + * truncated (and up to date) data node. + */ + ubifs_assert(PagePrivate(page)); + + clear_page_dirty_for_io(page); + if (UBIFS_BLOCKS_PER_PAGE_SHIFT) + offset = new_size & + (PAGE_CACHE_SIZE - 1); + err = do_writepage(page, offset); + page_cache_release(page); + if (err) + goto out_budg; + /* + * We could now tell 'ubifs_jnl_truncate()' not + * to read the last block. 
+ */ + } else { + /* + * We could 'kmap()' the page and pass the data + * to 'ubifs_jnl_truncate()' to save it from + * having to read it. + */ + unlock_page(page); + page_cache_release(page); + } + } + } + + mutex_lock(&ui->ui_mutex); + ui->ui_size = inode->i_size; + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* The other attributes may be changed at the same time as well */ + do_attr_changes(inode, attr); + + err = ubifs_jnl_truncate(c, inode, old_size, new_size); + mutex_unlock(&ui->ui_mutex); +out_budg: + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } + return err; +} + +/** + * do_setattr - change inode attributes. + * @c: UBIFS file-system description object + * @inode: inode to change attributes for + * @attr: inode attribute changes description + * + * This function implements VFS '->setattr()' call for all cases except + * truncations to smaller size. Returns zero in case of success and a negative + * error code in case of failure. 
+ */ +static int do_setattr(struct ubifs_info *c, struct inode *inode, + const struct iattr *attr) +{ + int err, release; + loff_t new_size = attr->ia_size; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + if (attr->ia_valid & ATTR_SIZE) { + dbg_gen("size %lld -> %lld", inode->i_size, new_size); + err = vmtruncate(inode, new_size); + if (err) + goto out; + } + + mutex_lock(&ui->ui_mutex); + if (attr->ia_valid & ATTR_SIZE) { + /* Truncation changes inode [mc]time */ + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + /* 'vmtruncate()' changed @i_size, update @ui_size */ + ui->ui_size = inode->i_size; + } + + do_attr_changes(inode, attr); + + release = ui->dirty; + if (attr->ia_valid & ATTR_SIZE) + /* + * Inode length changed, so we have to make sure + * @I_DIRTY_DATASYNC is set. + */ + __mark_inode_dirty(inode, I_DIRTY_SYNC | I_DIRTY_DATASYNC); + else + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = inode->i_sb->s_op->write_inode(inode, 1); + return err; + +out: + ubifs_release_budget(c, &req); + return err; +} + +int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + dbg_gen("ino %lu, mode %#x, ia_valid %#x", + inode->i_ino, inode->i_mode, attr->ia_valid); + err = inode_change_ok(inode, attr); + if (err) + return err; + + err = dbg_check_synced_i_size(inode); + if (err) + return err; + + if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size < inode->i_size) + /* Truncation to a smaller size */ + err = do_truncation(c, inode, attr); + else + err = do_setattr(c, inode, attr); + + return err; +} + +static void ubifs_invalidatepage(struct page *page, unsigned long offset) +{ + struct inode *inode 
= page->mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + ubifs_assert(PagePrivate(page)); + if (offset) + /* Partial page remains dirty */ + return; + + if (PageChecked(page)) + release_new_page_budget(c); + else + release_existing_page_budget(c); + + atomic_long_dec(&c->dirty_pg_cnt); + ClearPagePrivate(page); + ClearPageChecked(page); +} + +static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); + + nd_set_link(nd, ui->data); + return NULL; +} + +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + + dbg_gen("syncing inode %lu", inode->i_ino); + + /* + * VFS has already synchronized dirty pages for this inode. Synchronize + * the inode unless this is a 'datasync()' call. + */ + if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { + err = inode->i_sb->s_op->write_inode(inode, 1); + if (err) + return err; + } + + /* + * Nodes related to this inode may still sit in a write-buffer. Flush + * them. + */ + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + + return 0; +} + +/** + * mctime_update_needed - check if mtime or ctime update is needed. + * @inode: the inode to do the check for + * @now: current time + * + * This helper function checks if the inode mtime/ctime should be updated or + * not. If current values of the time-stamps are within the UBIFS inode time + * granularity, they are not updated. This is an optimization. + */ +static inline int mctime_update_needed(struct inode *inode, + struct timespec *now) +{ + if (!timespec_equal(&inode->i_mtime, now) || + !timespec_equal(&inode->i_ctime, now)) + return 1; + return 0; +} + +/** + * update_mctime - update mtime and ctime of an inode.
+ * @c: UBIFS file-system description object + * @inode: inode to update + * + * This function updates mtime and ctime of the inode if it is not equivalent to + * current time. Returns zero in case of success and a negative error code in + * case of failure. + */ +static int update_mctime(struct ubifs_info *c, struct inode *inode) +{ + struct timespec now = ubifs_current_time(inode); + struct ubifs_inode *ui = ubifs_inode(inode); + + if (mctime_update_needed(inode, &now)) { + int err, release; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_budget(c, &req); + } + + return 0; +} + +static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + int err; + ssize_t ret; + struct inode *inode = iocb->ki_filp->f_mapping->host; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = update_mctime(c, inode); + if (err) + return err; + + ret = generic_file_aio_write(iocb, iov, nr_segs, pos); + if (ret < 0) + return ret; + + if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) { + err = ubifs_sync_wbufs_by_inode(c, inode); + if (err) + return err; + } + + return ret; +} + +static int ubifs_set_page_dirty(struct page *page) +{ + int ret; + + ret = __set_page_dirty_nobuffers(page); + /* + * An attempt to dirty a page without budgeting for it - should not + * happen. + */ + ubifs_assert(ret == 0); + return ret; +} + +static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags) +{ + /* + * An attempt to release a dirty page without budgeting for it - should + * not happen. 
+ */ + if (PageWriteback(page)) + return 0; + ubifs_assert(PagePrivate(page)); + ubifs_assert(0); + ClearPagePrivate(page); + ClearPageChecked(page); + return 1; +} + +/* + * mmap()d file has taken write protection fault and is being made + * writable. UBIFS must ensure page is budgeted for. + */ +static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct timespec now = ubifs_current_time(inode); + struct ubifs_budget_req req = { .new_page = 1 }; + int err, update_time; + + dbg_gen("ino %lu, pg %lu, i_size %lld", inode->i_ino, page->index, + i_size_read(inode)); + ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY)); + + if (unlikely(c->ro_media)) + return -EROFS; + + /* + * We have not locked @page so far so we may budget for changing the + * page. Note, we cannot do this after we locked the page, because + * budgeting may cause write-back which would cause deadlock. + * + * At the moment we do not know whether the page is dirty or not, so we + * assume that it is not and budget for a new page. We could look at + * the @PG_private flag and figure this out, but we may race with write + * back and the page state may change by the time we lock it, so this + * would need additional care. We do not bother with this at the + * moment, although it might be good idea to do. Instead, we allocate + * budget for a new page and amend it later on if the page was in fact + * dirty. + * + * The budgeting-related logic of this function is similar to what we + * do in 'ubifs_write_begin()' and 'ubifs_write_end()'. Glance there + * for more comments. + */ + update_time = mctime_update_needed(inode, &now); + if (update_time) + /* + * We have to change inode time stamp which requires extra + * budgeting. 
+ */ + req.dirtied_ino = 1; + + err = ubifs_budget_space(c, &req); + if (unlikely(err)) { + if (err == -ENOSPC) + ubifs_warn("out of space for mmapped file " + "(inode number %lu)", inode->i_ino); + return err; + } + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping || + page_offset(page) > i_size_read(inode))) { + /* Page got truncated out from underneath us */ + err = -EINVAL; + goto out_unlock; + } + + if (PagePrivate(page)) + release_new_page_budget(c); + else { + if (!PageChecked(page)) + ubifs_convert_page_budget(c); + SetPagePrivate(page); + atomic_long_inc(&c->dirty_pg_cnt); + __set_page_dirty_nobuffers(page); + } + + if (update_time) { + int release; + struct ubifs_inode *ui = ubifs_inode(inode); + + mutex_lock(&ui->ui_mutex); + inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + if (release) + ubifs_release_dirty_inode_budget(c, ui); + } + + unlock_page(page); + return 0; + +out_unlock: + unlock_page(page); + ubifs_release_budget(c, &req); + return err; +} + +static struct vm_operations_struct ubifs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = ubifs_vm_page_mkwrite, +}; + +static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + int err; + + /* 'generic_file_mmap()' takes care of NOMMU case */ + err = generic_file_mmap(file, vma); + if (err) + return err; + vma->vm_ops = &ubifs_file_vm_ops; + return 0; +} + +const struct address_space_operations ubifs_file_address_operations = { + .readpage = ubifs_readpage, + .writepage = ubifs_writepage, + .write_begin = ubifs_write_begin, + .write_end = ubifs_write_end, + .invalidatepage = ubifs_invalidatepage, + .set_page_dirty = ubifs_set_page_dirty, + .releasepage = ubifs_releasepage, +}; + +const struct inode_operations ubifs_file_inode_operations = { + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +#ifdef CONFIG_UBIFS_FS_XATTR + .setxattr = ubifs_setxattr, 
+ .getxattr = ubifs_getxattr, + .listxattr = ubifs_listxattr, + .removexattr = ubifs_removexattr, +#endif +}; + +const struct inode_operations ubifs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ubifs_follow_link, + .setattr = ubifs_setattr, + .getattr = ubifs_getattr, +}; + +const struct file_operations ubifs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .aio_read = generic_file_aio_read, + .aio_write = ubifs_aio_write, + .mmap = ubifs_file_mmap, + .fsync = ubifs_fsync, + .unlocked_ioctl = ubifs_ioctl, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, +#ifdef CONFIG_COMPAT + .compat_ioctl = ubifs_compat_ioctl, +#endif +}; diff -Nurd linux-2.6.24/fs/ubifs/find.c ubifs-v2.6.24/fs/ubifs/find.c --- linux-2.6.24/fs/ubifs/find.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/find.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,977 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains functions for finding LEBs for various purposes e.g. + * garbage collection. 
In general, lprops category heaps and lists are used + * for fast access, falling back on scanning the LPT as a last resort. + */ + +#include +#include "ubifs.h" + +/** + * struct scan_data - data provided to scan callback functions + * @min_space: minimum number of bytes for which to scan + * @pick_free: whether it is OK to scan for empty LEBs + * @lnum: LEB number found is returned here + * @exclude_index: whether to exclude index LEBs + */ +struct scan_data { + int min_space; + int pick_free; + int lnum; + int exclude_index; +}; + +/** + * valuable - determine whether LEB properties are valuable. + * @c: the UBIFS file-system description object + * @lprops: LEB properties + * + * This function return %1 if the LEB properties should be added to the LEB + * properties tree in memory. Otherwise %0 is returned. + */ +static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops) +{ + int n, cat = lprops->flags & LPROPS_CAT_MASK; + struct ubifs_lpt_heap *heap; + + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + heap = &c->lpt_heap[cat - 1]; + if (heap->cnt < heap->max_cnt) + return 1; + if (lprops->free + lprops->dirty >= c->dark_wm) + return 1; + return 0; + case LPROPS_EMPTY: + n = c->lst.empty_lebs + c->freeable_cnt - + c->lst.taken_empty_lebs; + if (n < c->lsave_cnt) + return 1; + return 0; + case LPROPS_FREEABLE: + return 1; + case LPROPS_FRDI_IDX: + return 1; + } + return 0; +} + +/** + * scan_for_dirty_cb - dirty space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
+ */
+static int scan_for_dirty_cb(struct ubifs_info *c,
+			     const struct ubifs_lprops *lprops, int in_tree,
+			     struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+	int reclaimable;
+
+	/* A LEB that is already taken is skipped outright */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable properties that are not in memory yet should be added */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	reclaimable = lprops->free + lprops->dirty;
+
+	/* Less free plus dirty space than the caller asked for */
+	if (reclaimable < data->min_space)
+		return verdict;
+
+	/* An index LEB, and the caller does not want those */
+	if (data->exclude_index && (lprops->flags & LPROPS_INDEX))
+		return verdict;
+
+	if (reclaimable == c->leb_size) {
+		/* Empty or freeable LEB: acceptable only with @pick_free */
+		if (!data->pick_free)
+			return verdict;
+	} else {
+		/* Otherwise the dirty space must reach the dead watermark */
+		if (lprops->dirty < c->dead_wm)
+			return verdict;
+	}
+
+	/* This LEB will do: report it and stop the scan */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_dirty - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount free plus dirty space the returned LEB has to
+ *             have
+ * @pick_free: if it is OK to return a free or freeable LEB
+ * @exclude_index: whether to exclude index LEBs
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */ +static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, + int min_space, int pick_free, + int exclude_index) +{ + const struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + struct scan_data data; + int err, i; + + /* There may be an LEB with enough dirty space on the free heap */ + heap = &c->lpt_heap[LPROPS_FREE - 1]; + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (lprops->free + lprops->dirty < min_space) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* + * A LEB may have fallen off of the bottom of the dirty heap, and ended + * up as uncategorized even though it has enough dirty space for us now, + * so check the uncategorized list. N.B. neither empty nor freeable LEBs + * can end up as uncategorized because they are kept on lists not + * finite-sized heaps. + */ + list_for_each_entry(lprops, &c->uncat_list, list) { + if (lprops->flags & LPROPS_TAKEN) + continue; + if (lprops->free + lprops->dirty < min_space) + continue; + if (exclude_index && (lprops->flags & LPROPS_INDEX)) + continue; + if (lprops->dirty < c->dead_wm) + continue; + return lprops; + } + /* We have looked everywhere in main memory, now scan the flash */ + if (c->pnodes_have >= c->pnode_cnt) + /* All pnodes are in memory, so skip scan */ + return ERR_PTR(-ENOSPC); + data.min_space = min_space; + data.pick_free = pick_free; + data.lnum = -1; + data.exclude_index = exclude_index; + err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, + (ubifs_lpt_scan_callback)scan_for_dirty_cb, + &data); + if (err) + return ERR_PTR(err); + ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt); + c->lscan_lnum = data.lnum; + lprops = ubifs_lpt_lookup_dirty(c, data.lnum); + if (IS_ERR(lprops)) + return lprops; + ubifs_assert(lprops->lnum == data.lnum); + ubifs_assert(lprops->free + lprops->dirty >= min_space); + ubifs_assert(lprops->dirty >= c->dead_wm || + (pick_free && + lprops->free + lprops->dirty == c->leb_size)); + 
ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!exclude_index || !(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_find_dirty_leb - find a dirty LEB for the Garbage Collector. + * @c: the UBIFS file-system description object + * @ret_lp: LEB properties are returned here on exit + * @min_space: minimum amount free plus dirty space the returned LEB has to + * have + * @pick_free: controls whether it is OK to pick empty or index LEBs + * + * This function tries to find a dirty logical eraseblock which has at least + * @min_space free and dirty space. It prefers to take an LEB from the dirty or + * dirty index heap, and it falls-back to LPT scanning if the heaps are empty + * or do not have an LEB which satisfies the @min_space criteria. + * + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. + * + * The additional @pick_free argument controls if this function has to return a + * free or freeable LEB if one is present. For example, GC must to set it to %1, + * when called from the journal space reservation function, because the + * appearance of free space may coincide with the loss of enough dirty space + * for GC to succeed anyway. + * + * In contrast, if the Garbage Collector is called from budgeting, it should + * just make free space, not return LEBs which are already free or freeable. + * + * In addition @pick_free is set to %2 by the recovery process in order to + * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". + */ +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free) +{ + int err = 0, sum, exclude_index = pick_free == 2 ? 
1 : 0; + const struct ubifs_lprops *lp = NULL, *idx_lp = NULL; + struct ubifs_lpt_heap *heap, *idx_heap; + + ubifs_get_lprops(c); + + if (pick_free) { + int lebs, rsvd_idx_lebs = 0; + + spin_lock(&c->space_lock); + lebs = c->lst.empty_lebs + c->idx_gc_cnt; + lebs += c->freeable_cnt - c->lst.taken_empty_lebs; + + /* + * Note, the index may consume more LEBs than have been reserved + * for it. It is OK because it might be consolidated by GC. + * But if the index takes fewer LEBs than it is reserved for it, + * this function must avoid picking those reserved LEBs. + */ + if (c->min_idx_lebs >= c->lst.idx_lebs) { + rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs; + exclude_index = 1; + } + spin_unlock(&c->space_lock); + + /* Check if there are enough free LEBs for the index */ + if (rsvd_idx_lebs < lebs) { + /* OK, try to find an empty LEB */ + lp = ubifs_fast_find_empty(c); + if (lp) + goto found; + + /* Or a freeable LEB */ + lp = ubifs_fast_find_freeable(c); + if (lp) + goto found; + } else + /* + * We cannot pick free/freeable LEBs in the below code. + */ + pick_free = 0; + } else { + spin_lock(&c->space_lock); + exclude_index = (c->min_idx_lebs >= c->lst.idx_lebs); + spin_unlock(&c->space_lock); + } + + /* Look on the dirty and dirty index heaps */ + heap = &c->lpt_heap[LPROPS_DIRTY - 1]; + idx_heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1]; + + if (idx_heap->cnt && !exclude_index) { + idx_lp = idx_heap->arr[0]; + sum = idx_lp->free + idx_lp->dirty; + /* + * Since we reserve thrice as much space for the index than it + * actually takes, it does not make sense to pick indexing LEBs + * with less than, say, half LEB of dirty space. May be half is + * not the optimal boundary - this should be tested and + * checked. This boundary should determine how much we use + * in-the-gaps to consolidate the index comparing to how much + * we use garbage collector to consolidate it. The "half" + * criteria just feels to be fine. 
+ */ + if (sum < min_space || sum < c->half_leb_size) + idx_lp = NULL; + } + + if (heap->cnt) { + lp = heap->arr[0]; + if (lp->dirty + lp->free < min_space) + lp = NULL; + } + + /* Pick the LEB with most space */ + if (idx_lp && lp) { + if (idx_lp->free + idx_lp->dirty >= lp->free + lp->dirty) + lp = idx_lp; + } else if (idx_lp && !lp) + lp = idx_lp; + + if (lp) { + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); + goto found; + } + + /* Did not find a dirty LEB on the dirty heaps, have to scan */ + dbg_find("scanning LPT for a dirty LEB"); + lp = scan_for_dirty(c, min_space, pick_free, exclude_index); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(lp->dirty >= c->dead_wm || + (pick_free && lp->free + lp->dirty == c->leb_size)); + +found: + dbg_find("found LEB %d, free %d, dirty %d, flags %#x", + lp->lnum, lp->free, lp->dirty, lp->flags); + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + memcpy(ret_lp, lp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * scan_for_free_cb - free space scan callback. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). 
+ */
+static int scan_for_free_cb(struct ubifs_info *c,
+			    const struct ubifs_lprops *lprops, int in_tree,
+			    struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+	int only_free_and_dirty;
+
+	/* A LEB that is already taken is never a candidate */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable properties that are not in memory yet get added */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/* Index LEBs and LEBs with too little free space are rejected */
+	if ((lprops->flags & LPROPS_INDEX) || lprops->free < data->min_space)
+		return verdict;
+
+	/* Empty LEBs are rejected unless the caller asked for them */
+	if (lprops->free == c->leb_size && !data->pick_free)
+		return verdict;
+
+	/*
+	 * A LEB consisting only of free and dirty space must not be handed
+	 * out: it may already have been unmapped, or its data may be obsolete
+	 * only because of nodes that are still sitting in a wbuf.
+	 */
+	only_free_and_dirty = (lprops->free + lprops->dirty == c->leb_size);
+	if (only_free_and_dirty && lprops->dirty > 0)
+		return verdict;
+
+	/* This LEB satisfies every criterion - report it and stop the scan */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * do_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of free space required
+ * @pick_free: whether it is OK to scan for empty LEBs
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function returns a pointer to the LEB properties found or a negative
+ * error code.
+ */
+static
+const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
+					      int min_space, int pick_free,
+					      int squeeze)
+{
+	const struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	struct scan_data data;
+	int err, i;
+
+	/* When squeezing, the fast free lookup is tried before empty LEBs */
+	if (squeeze) {
+		lprops = ubifs_fast_find_free(c);
+		if (lprops && lprops->free >= min_space)
+			return lprops;
+	}
+	/* Empty LEBs are considered only if the caller allows it */
+	if (pick_free) {
+		lprops = ubifs_fast_find_empty(c);
+		if (lprops)
+			return lprops;
+	}
+	/* When not squeezing, the fast free lookup comes after empty LEBs */
+	if (!squeeze) {
+		lprops = ubifs_fast_find_free(c);
+		if (lprops && lprops->free >= min_space)
+			return lprops;
+	}
+	/* There may be an LEB with enough free space on the dirty heap */
+	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		lprops = heap->arr[i];
+		if (lprops->free >= min_space)
+			return lprops;
+	}
+	/*
+	 * A LEB may have fallen off of the bottom of the free heap, and ended
+	 * up as uncategorized even though it has enough free space for us now,
+	 * so check the uncategorized list. N.B. neither empty nor freeable LEBs
+	 * can end up as uncategorized because they are kept on lists not
+	 * finite-sized heaps.
+	 */
+	list_for_each_entry(lprops, &c->uncat_list, list) {
+		if (lprops->flags & LPROPS_TAKEN)
+			continue;
+		if (lprops->flags & LPROPS_INDEX)
+			continue;
+		if (lprops->free >= min_space)
+			return lprops;
+	}
+	/* We have looked everywhere in main memory, now scan the flash */
+	if (c->pnodes_have >= c->pnode_cnt)
+		/* All pnodes are in memory, so skip scan */
+		return ERR_PTR(-ENOSPC);
+	/* The search criteria reach scan_for_free_cb() through @data */
+	data.min_space = min_space;
+	data.pick_free = pick_free;
+	data.lnum = -1;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_free_cb,
+				    &data);
+	if (err)
+		return ERR_PTR(err);
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	/* Remember the LEB found; it is passed to the next LPT scan */
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return lprops;
+	/* The LEB looked up must match what the callback accepted */
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free >= min_space);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lprops->flags & LPROPS_INDEX));
+	return lprops;
+}
+
+/**
+ * ubifs_find_free_space - find a data LEB with free space.
+ * @c: the UBIFS file-system description object
+ * @min_space: minimum amount of required free space
+ * @offs: contains offset of where free space starts on exit
+ * @squeeze: whether to try to find space in a non-empty LEB first
+ *
+ * This function looks for an LEB with at least @min_space bytes of free space.
+ * It tries to find an empty LEB if possible. If no empty LEBs are available,
+ * this function searches for a non-empty data LEB. The returned LEB is marked
+ * as "taken".
+ *
+ * This function returns found LEB number in case of success, %-ENOSPC if it
+ * failed to find a LEB with @min_space bytes of free space and other negative
+ * error codes in case of failure.
+ */
+int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
+			  int squeeze)
+{
+	const struct ubifs_lprops *lprops;
+	int lebs, rsvd_idx_lebs, pick_free = 0, err, lnum, flags;
+
+	dbg_find("min_space %d", min_space);
+	ubifs_get_lprops(c);
+
+	/* Check if there are enough empty LEBs for commit */
+	spin_lock(&c->space_lock);
+	/* How many more LEBs the index needs to reach @c->min_idx_lebs */
+	if (c->min_idx_lebs > c->lst.idx_lebs)
+		rsvd_idx_lebs = c->min_idx_lebs - c->lst.idx_lebs;
+	else
+		rsvd_idx_lebs = 0;
+	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
+	       c->lst.taken_empty_lebs;
+	if (rsvd_idx_lebs < lebs)
+		/*
+		 * OK to allocate an empty LEB, but we still don't want to go
+		 * looking for one if there aren't any.
+		 */
+		if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+			pick_free = 1;
+			/*
+			 * Because we release the space lock, we must account
+			 * for this allocation here. After the LEB properties
+			 * flags have been updated, we subtract one. Note, the
+			 * result of this is that lprops also decreases
+			 * @taken_empty_lebs in 'ubifs_change_lp()', so it is
+			 * off by one for a short period of time which may
+			 * introduce a small disturbance to budgeting
+			 * calculations, but this is harmless because at the
+			 * worst case this would make the budgeting subsystem
+			 * be more pessimistic than needed.
+			 *
+			 * Fundamentally, this is about serialization of the
+			 * budgeting and lprops subsystems. We could make the
+			 * @space_lock a mutex and avoid dropping it before
+			 * calling 'ubifs_change_lp()', but mutex is more
+			 * heavy-weight, and we want budgeting to be as fast as
+			 * possible.
+			 */
+			c->lst.taken_empty_lebs += 1;
+		}
+	spin_unlock(&c->space_lock);
+
+	lprops = do_find_free_space(c, min_space, pick_free, squeeze);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	/* Add the "taken" flag to the flags the LEB already has */
+	lnum = lprops->lnum;
+	flags = lprops->flags | LPROPS_TAKEN;
+
+	lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC, flags, 0);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	/* Drop the provisional increment made under @space_lock above */
+	if (pick_free) {
+		spin_lock(&c->space_lock);
+		c->lst.taken_empty_lebs -= 1;
+		spin_unlock(&c->space_lock);
+	}
+
+	/* Free space begins where the used part of the LEB ends */
+	*offs = c->leb_size - lprops->free;
+	ubifs_release_lprops(c);
+
+	if (*offs == 0) {
+		/*
+		 * Ensure that empty LEBs have been unmapped. They may not have
+		 * been, for example, because of an unclean unmount. Also
+		 * LEBs that were freeable LEBs (free + dirty == leb_size) will
+		 * not have been unmapped.
+		 */
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+
+	dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
+	ubifs_assert(*offs <= c->leb_size - min_space);
+	return lnum;
+
+out:
+	/* Error path: the provisional increment must be dropped here too */
+	if (pick_free) {
+		spin_lock(&c->space_lock);
+		c->lst.taken_empty_lebs -= 1;
+		spin_unlock(&c->space_lock);
+	}
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * scan_for_idx_cb - callback used by the scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to scan
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * This function returns a code that indicates whether the scan should continue
+ * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree
+ * in main memory (%LPT_SCAN_ADD), or whether the scan should stop
+ * (%LPT_SCAN_STOP).
+ */
+static int scan_for_idx_cb(struct ubifs_info *c,
+			   const struct ubifs_lprops *lprops, int in_tree,
+			   struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+
+	/* A taken LEB is not a candidate and is never added to the tree */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable properties that are not in memory yet get added */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/*
+	 * Reject index LEBs, and LEBs holding anything besides free and dirty
+	 * space because those cannot be made empty.
+	 */
+	if ((lprops->flags & LPROPS_INDEX) ||
+	    lprops->free + lprops->dirty != c->leb_size)
+		return verdict;
+
+	/*
+	 * The allocation is for the index, and write buffers are sync'd at
+	 * commit start, so a LEB with only free and dirty space is safe to
+	 * hand out.
+	 */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * scan_for_leb_for_idx - scan for a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * Scans the LPT with 'scan_for_idx_cb()' and returns the properties of the LEB
+ * the callback settled on, or an error pointer if the scan or the lookup
+ * failed.
+ */
+static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
+{
+	struct scan_data sd;
+	struct ubifs_lprops *lp;
+	int ret;
+
+	sd.lnum = -1;
+	ret = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_for_idx_cb,
+				    &sd);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ubifs_assert(sd.lnum >= c->main_first && sd.lnum < c->leb_cnt);
+	c->lscan_lnum = sd.lnum;
+
+	lp = ubifs_lpt_lookup_dirty(c, sd.lnum);
+	if (IS_ERR(lp))
+		return lp;
+
+	ubifs_assert(lp->lnum == sd.lnum);
+	ubifs_assert(lp->free + lp->dirty == c->leb_size);
+	ubifs_assert(!(lp->flags & LPROPS_TAKEN));
+	ubifs_assert(!(lp->flags & LPROPS_INDEX));
+	return lp;
+}
+
+/**
+ * ubifs_find_free_leb_for_idx - find a free LEB for the index.
+ * @c: the UBIFS file-system description object
+ *
+ * This function looks for a free LEB and returns that LEB number. The returned
+ * LEB is marked as "taken", "index".
+ *
+ * Only empty LEBs are allocated. This is for two reasons.
+ * First, the commit
+ * calculates the number of LEBs to allocate based on the assumption that they
+ * will be empty. Secondly, free space at the end of an index LEB is not
+ * guaranteed to be empty because it may have been used by the in-the-gaps
+ * method prior to an unclean unmount.
+ *
+ * If no LEB is found %-ENOSPC is returned. For other failures another negative
+ * error code is returned.
+ */
+int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lprops;
+	int lnum = -1, err, flags;
+
+	ubifs_get_lprops(c);
+
+	/* Try empty LEBs first, then freeable ones, then scan the LPT */
+	lprops = ubifs_fast_find_empty(c);
+	if (!lprops) {
+		lprops = ubifs_fast_find_freeable(c);
+		if (!lprops) {
+			ubifs_assert(c->freeable_cnt == 0);
+			/* Scan only if the counters say an empty LEB exists */
+			if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+				lprops = scan_for_leb_for_idx(c);
+				if (IS_ERR(lprops)) {
+					err = PTR_ERR(lprops);
+					goto out;
+				}
+			}
+		}
+	}
+
+	if (!lprops) {
+		err = -ENOSPC;
+		goto out;
+	}
+
+	lnum = lprops->lnum;
+
+	dbg_find("found LEB %d, free %d, dirty %d, flags %#x",
+		 lnum, lprops->free, lprops->dirty, lprops->flags);
+
+	/* Record the LEB as entirely free and mark it "taken" and "index" */
+	flags = lprops->flags | LPROPS_TAKEN | LPROPS_INDEX;
+	lprops = ubifs_change_lp(c, lprops, c->leb_size, 0, flags, 0);
+	if (IS_ERR(lprops)) {
+		err = PTR_ERR(lprops);
+		goto out;
+	}
+
+	ubifs_release_lprops(c);
+
+	/*
+	 * Ensure that empty LEBs have been unmapped. They may not have been,
+	 * for example, because of an unclean unmount. Also LEBs that were
+	 * freeable LEBs (free + dirty == leb_size) will not have been unmapped.
+	 */
+	err = ubifs_leb_unmap(c, lnum);
+	if (err) {
+		/*
+		 * NOTE(review): presumably this clears the "taken" and "index"
+		 * flags set above - confirm against 'ubifs_change_one_lp()'.
+		 */
+		ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0,
+				    LPROPS_TAKEN | LPROPS_INDEX, 0);
+		return err;
+	}
+
+	return lnum;
+
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/*
+ * Comparison helper passed to 'sort()': orders LEB properties by the sum of
+ * their dirty and free space, smallest first. @a and @b point to array
+ * elements, which are themselves pointers to LEB properties.
+ */
+static int cmp_dirty_idx(const struct ubifs_lprops **a,
+			 const struct ubifs_lprops **b)
+{
+	const struct ubifs_lprops *lpa = *a;
+	const struct ubifs_lprops *lpb = *b;
+
+	return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
+}
+
+/*
+ * Swap helper passed to 'sort()': exchanges the two LEB properties pointers.
+ * @size is not used.
+ */
+static void swap_dirty_idx(struct ubifs_lprops **a, struct ubifs_lprops **b,
+			   int size)
+{
+	struct ubifs_lprops *t = *a;
+
+	*a = *b;
+	*b = t;
+}
+
+/**
+ * ubifs_save_dirty_idx_lnums - save an array of the most dirty index LEB nos.
+ * @c: the UBIFS file-system description object
+ *
+ * This function is called each commit to create an array of LEB numbers of
+ * dirty index LEBs sorted in order of dirty and free space. This is used by
+ * the in-the-gaps method of TNC commit.
+ *
+ * This function always returns zero.
+ */
+int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
+{
+	int i;
+
+	ubifs_get_lprops(c);
+	/* Copy the LPROPS_DIRTY_IDX heap */
+	c->dirty_idx.cnt = c->lpt_heap[LPROPS_DIRTY_IDX - 1].cnt;
+	memcpy(c->dirty_idx.arr, c->lpt_heap[LPROPS_DIRTY_IDX - 1].arr,
+	       sizeof(void *) * c->dirty_idx.cnt);
+	/* Sort it so that the dirtiest is now at the end */
+	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
+	     (int (*)(const void *, const void *))cmp_dirty_idx,
+	     (void (*)(void *, void *, int))swap_dirty_idx);
+	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
+	if (c->dirty_idx.cnt)
+		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->lnum,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->dirty,
+			 c->dirty_idx.arr[c->dirty_idx.cnt - 1]->free);
+	/* Replace the lprops pointers with LEB numbers */
+	for (i = 0; i < c->dirty_idx.cnt; i++)
+		c->dirty_idx.arr[i] = (void *)(size_t)c->dirty_idx.arr[i]->lnum;
+	ubifs_release_lprops(c);
+	return 0;
+}
+
+/**
+ * scan_dirty_idx_cb -
+ * callback used by the scan for a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ * @lprops: LEB properties to examine
+ * @in_tree: whether the LEB properties are in main memory
+ * @data: information passed to and from the caller of the scan
+ *
+ * The return value tells the scan whether to carry on (%LPT_SCAN_CONTINUE),
+ * whether the LEB properties should be added to the tree in main memory
+ * (%LPT_SCAN_ADD), or whether to stop (%LPT_SCAN_STOP). When the scan is
+ * stopped, the LEB number found is stored in @data->lnum.
+ */
+static int scan_dirty_idx_cb(struct ubifs_info *c,
+			   const struct ubifs_lprops *lprops, int in_tree,
+			   struct scan_data *data)
+{
+	int verdict = LPT_SCAN_CONTINUE;
+
+	/* A taken LEB is not a candidate and is never added to the tree */
+	if (lprops->flags & LPROPS_TAKEN)
+		return LPT_SCAN_CONTINUE;
+
+	/* Valuable properties that are not in memory yet get added */
+	if (!in_tree && valuable(c, lprops))
+		verdict |= LPT_SCAN_ADD;
+
+	/*
+	 * Only index LEBs qualify, and only if their free plus dirty space
+	 * can hold at least the smallest index node.
+	 */
+	if (!(lprops->flags & LPROPS_INDEX) ||
+	    lprops->free + lprops->dirty < c->min_idx_node_sz)
+		return verdict;
+
+	/* This LEB qualifies - report it and stop the scan */
+	data->lnum = lprops->lnum;
+	return LPT_SCAN_ADD | LPT_SCAN_STOP;
+}
+
+/**
+ * find_dirty_idx_leb - find a dirty index LEB.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns LEB number upon success and a negative error code upon
+ * failure. In particular, -ENOSPC is returned if a dirty index LEB is not
+ * found.
+ *
+ * Note that this function scans the entire LPT but it is called very rarely.
+ */
+static int find_dirty_idx_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lprops;
+	struct ubifs_lpt_heap *heap;
+	struct scan_data data;
+	int err, i, ret;
+
+	/*
+	 * Check all structures in memory first. The scan callback is called
+	 * directly with in_tree == 1, and it stores the LEB number in
+	 * data.lnum when it accepts a LEB.
+	 */
+	data.lnum = -1;
+	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
+	for (i = 0; i < heap->cnt; i++) {
+		lprops = heap->arr[i];
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	list_for_each_entry(lprops, &c->uncat_list, list) {
+		ret = scan_dirty_idx_cb(c, lprops, 1, &data);
+		if (ret & LPT_SCAN_STOP)
+			goto found;
+	}
+	if (c->pnodes_have >= c->pnode_cnt)
+		/* All pnodes are in memory, so skip scan */
+		return -ENOSPC;
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
+				    (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+				    &data);
+	if (err)
+		return err;
+found:
+	ubifs_assert(data.lnum >= c->main_first && data.lnum < c->leb_cnt);
+	/* Remember the LEB found; it is passed to the next LPT scan */
+	c->lscan_lnum = data.lnum;
+	lprops = ubifs_lpt_lookup_dirty(c, data.lnum);
+	if (IS_ERR(lprops))
+		return PTR_ERR(lprops);
+	/* The LEB looked up must match what the callback accepted */
+	ubifs_assert(lprops->lnum == data.lnum);
+	ubifs_assert(lprops->free + lprops->dirty >= c->min_idx_node_sz);
+	ubifs_assert(!(lprops->flags & LPROPS_TAKEN));
+	ubifs_assert((lprops->flags & LPROPS_INDEX));
+
+	dbg_find("found dirty LEB %d, free %d, dirty %d, flags %#x",
+		 lprops->lnum, lprops->free, lprops->dirty, lprops->flags);
+
+	/* Add the "taken" flag to the flags the LEB already has */
+	lprops = ubifs_change_lp(c, lprops, LPROPS_NC, LPROPS_NC,
+				 lprops->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lprops))
+		return PTR_ERR(lprops);
+
+	return lprops->lnum;
+}
+
+/**
+ * get_idx_gc_leb - try to get a LEB number from trivial GC.
+ * @c: the UBIFS file-system description object
+ *
+ * This function returns the LEB number on success and a negative error code
+ * on failure.
+ */
+static int get_idx_gc_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int err, lnum;
+
+	/* A non-negative return value is the LEB number */
+	err = ubifs_get_idx_gc_leb(c);
+	if (err < 0)
+		return err;
+	lnum = err;
+	/*
+	 * The LEB was due to be unmapped after the commit but
+	 * it is needed now for this commit.
+	 */
+	lp = ubifs_lpt_lookup_dirty(c, lnum);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+	lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+			     lp->flags | LPROPS_INDEX, -1);
+	if (IS_ERR(lp))
+		return PTR_ERR(lp);
+	dbg_find("LEB %d, dirty %d and free %d flags %#x",
+		 lp->lnum, lp->dirty, lp->free, lp->flags);
+	return lnum;
+}
+
+/**
+ * find_dirtiest_idx_leb - find dirtiest index LEB from dirtiest array.
+ * @c: the UBIFS file-system description object
+ *
+ * The array @c->dirty_idx is consumed from its end. Entries that are by now
+ * taken, or are no longer index LEBs, are skipped. The LEB picked is marked as
+ * "taken".
+ *
+ * This function returns the LEB number on success, %-ENOSPC if the array is
+ * exhausted, and other negative error codes on failure.
+ */
+static int find_dirtiest_idx_leb(struct ubifs_info *c)
+{
+	const struct ubifs_lprops *lp;
+	int lnum;
+
+	while (1) {
+		if (!c->dirty_idx.cnt)
+			return -ENOSPC;
+		/* The lprops pointers were replaced by LEB numbers */
+		lnum = (size_t)c->dirty_idx.arr[--c->dirty_idx.cnt];
+		lp = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		/* The LEB may have changed since the array was saved */
+		if ((lp->flags & LPROPS_TAKEN) || !(lp->flags & LPROPS_INDEX))
+			continue;
+		lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
+				     lp->flags | LPROPS_TAKEN, 0);
+		if (IS_ERR(lp))
+			return PTR_ERR(lp);
+		break;
+	}
+	dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
+		 lp->free, lp->flags);
+	/*
+	 * Both flags must really be set now. The flags have to be tested with
+	 * '&': with '|' the expression is non-zero for any value of the flags
+	 * and the assertions could never trigger.
+	 */
+	ubifs_assert(lp->flags & LPROPS_TAKEN);
+	ubifs_assert(lp->flags & LPROPS_INDEX);
+	return lnum;
+}
+
+/**
+ * ubifs_find_dirty_idx_leb - try to find dirtiest index LEB as at last commit.
+ * @c: the UBIFS file-system description object
+ *
+ * This function attempts to find an untaken index LEB with the most free and
+ * dirty space that can be used without overwriting index nodes that were in the
+ * last index committed.
+ */
+int ubifs_find_dirty_idx_leb(struct ubifs_info *c)
+{
+	int err;
+
+	ubifs_get_lprops(c);
+
+	/*
+	 * We made an array of the dirtiest index LEB numbers as at the start of
+	 * last commit. Try that array first.
+	 */
+	err = find_dirtiest_idx_leb(c);
+
+	/* Next try scanning the entire LPT */
+	if (err == -ENOSPC)
+		err = find_dirty_idx_leb(c);
+
+	/* Finally take any index LEBs awaiting trivial GC */
+	if (err == -ENOSPC)
+		err = get_idx_gc_leb(c);
+
+	ubifs_release_lprops(c);
+	/* LEB number on success, negative error code otherwise */
+	return err;
+}
diff -Nurd linux-2.6.24/fs/ubifs/gc.c ubifs-v2.6.24/fs/ubifs/gc.c
--- linux-2.6.24/fs/ubifs/gc.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/gc.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,1033 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements garbage collection. The procedure for garbage collection
+ * is different depending on whether a LEB is an index LEB (contains index
+ * nodes) or not. For non-index LEBs, garbage collection finds a LEB which
+ * contains a lot of dirty space (obsolete nodes), and copies the non-obsolete
+ * nodes to the journal, at which point the garbage-collected LEB is free to be
+ * reused.
+ * For index LEBs, garbage collection marks the non-obsolete index nodes
+ * dirty in the TNC, and after the next commit, the garbage-collected LEB is
+ * free to be reused. Garbage collection will cause the number of dirty index
+ * nodes to grow, however sufficient space is reserved for the index to ensure
+ * the commit will never run out of space.
+ *
+ * Notes about dead watermark. In the current UBIFS implementation we assume
+ * that LEBs which have less than @c->dead_wm bytes of free + dirty space are
+ * full and not worth garbage-collecting. The dead watermark is one min. I/O
+ * unit size, or min. UBIFS node size, whichever is greater. Indeed, UBIFS
+ * Garbage Collector has to synchronize the GC head's write buffer before
+ * returning, so this is about wasting one min. I/O unit. However, UBIFS GC can
+ * actually reclaim even very small pieces of dirty space by garbage collecting
+ * enough dirty LEBs, but we do not bother doing this in this implementation.
+ *
+ * Notes about dark watermark. The results of GC work depend on how big the
+ * UBIFS nodes GC deals with are. Large nodes make GC waste more space. Indeed,
+ * if GC moves data from LEB A to LEB B and nodes in LEB A are large, GC would
+ * have to waste large pieces of free space at the end of LEB B, because nodes
+ * from LEB A would not fit. And the worst situation is when all nodes are of
+ * maximum size. So dark watermark is the amount of free + dirty space in LEB
+ * which is guaranteed to be reclaimable. If LEB has less space, the GC might
+ * be unable to reclaim it. So, LEBs with free + dirty greater than dark
+ * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
+ * good, and GC takes extra care when moving them.
+ */
+
+#include
+#include "ubifs.h"
+
+/*
+ * GC may need to move more than one LEB to make progress. The below constants
+ * define "soft" and "hard" limits on the number of LEBs the garbage collector
+ * may move.
+ */
+#define SOFT_LEBS_LIMIT 4
+#define HARD_LEBS_LIMIT 32
+
+/**
+ * switch_gc_head - switch the garbage collection journal head.
+ * @c: UBIFS file-system description object
+ *
+ * This function switches the GC head to the next LEB which is reserved in
+ * @c->gc_lnum. Returns %0 in case of success, %-EAGAIN if commit is required,
+ * and other negative error code in case of failures.
+ */
+static int switch_gc_head(struct ubifs_info *c)
+{
+	int err, gc_lnum = c->gc_lnum;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	/* A LEB must have been reserved for GC beforehand */
+	ubifs_assert(gc_lnum != -1);
+	dbg_gc("switch GC head from LEB %d:%d to LEB %d (waste %d bytes)",
+	       wbuf->lnum, wbuf->offs + wbuf->used, gc_lnum,
+	       c->leb_size - wbuf->offs - wbuf->used);
+
+	err = ubifs_wbuf_sync_nolock(wbuf);
+	if (err)
+		return err;
+
+	/*
+	 * The GC write-buffer was synchronized, we may safely unmap
+	 * 'c->gc_lnum'.
+	 */
+	err = ubifs_leb_unmap(c, gc_lnum);
+	if (err)
+		return err;
+
+	err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
+	if (err)
+		return err;
+
+	/* The reserved LEB is consumed; point the GC write-buffer at it */
+	c->gc_lnum = -1;
+	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+	return err;
+}
+
+/**
+ * list_sort - sort a list.
+ * @priv: private data, passed to @cmp
+ * @head: the list to sort
+ * @cmp: the elements comparison function
+ *
+ * This function has been implemented by Mark J Roberts. It
+ * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted
+ * in ascending order.
+ *
+ * The comparison function @cmp is supposed to return a negative value if @a is
+ * less than @b, and a positive value if @a is greater than @b. If @a and @b
+ * are equivalent, then it does not matter what this function returns.
+ */
+static void list_sort(void *priv, struct list_head *head,
+		      int (*cmp)(void *priv, struct list_head *a,
+				 struct list_head *b))
+{
+	struct list_head *p, *q, *e, *list, *tail, *oldhead;
+	int insize, nmerges, psize, qsize, i;
+
+	if (list_empty(head))
+		return;
+
+	/* Take @head out, so that the elements form a ring on their own */
+	list = head->next;
+	list_del(head);
+	/* Runs of @insize elements are merged; the size doubles every pass */
+	insize = 1;
+	for (;;) {
+		p = oldhead = list;
+		list = tail = NULL;
+		nmerges = 0;
+
+		while (p) {
+			nmerges++;
+			/* Step @q at most @insize elements past @p */
+			q = p;
+			psize = 0;
+			for (i = 0; i < insize; i++) {
+				psize++;
+				q = q->next == oldhead ? NULL : q->next;
+				if (!q)
+					break;
+			}
+
+			/* Merge the run starting at @p with the one at @q */
+			qsize = insize;
+			while (psize > 0 || (qsize > 0 && q)) {
+				if (!psize) {
+					/* The @p run is used up, take from @q */
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				} else if (!qsize || !q) {
+					/* The @q run is used up, take from @p */
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else if (cmp(priv, p, q) <= 0) {
+					/* On a tie the element of @p goes first */
+					e = p;
+					p = p->next;
+					psize--;
+					if (p == oldhead)
+						p = NULL;
+				} else {
+					e = q;
+					q = q->next;
+					qsize--;
+					if (q == oldhead)
+						q = NULL;
+				}
+				/* Append @e to the list being built */
+				if (tail)
+					tail->next = e;
+				else
+					list = e;
+				e->prev = tail;
+				tail = e;
+			}
+			/* Both runs are merged, go on with what follows */
+			p = q;
+		}
+
+		/* Close the ring again */
+		tail->next = list;
+		list->prev = tail;
+
+		/* A pass with at most one merge means the list is sorted */
+		if (nmerges <= 1)
+			break;
+
+		insize *= 2;
+	}
+
+	/* Put @head back in front of the first element */
+	head->next = list;
+	head->prev = list->prev;
+	list->prev->next = head;
+	list->prev = head;
+}
+
+/**
+ * data_nodes_cmp - compare 2 data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first data node
+ * @b: second data node
+ *
+ * This function compares data nodes @a and @b. Returns %1 if @a has greater
+ * inode or block number, and %-1 otherwise.
+ */
+int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *node_a, *node_b;
+	ino_t ino_a, ino_b;
+	unsigned int blk_a, blk_b;
+
+	cond_resched();
+	node_a = list_entry(a, struct ubifs_scan_node, list);
+	node_b = list_entry(b, struct ubifs_scan_node, list);
+	ubifs_assert(key_type(c, &node_a->key) == UBIFS_DATA_KEY);
+	ubifs_assert(key_type(c, &node_b->key) == UBIFS_DATA_KEY);
+
+	/* The inode number is the primary sort criterion */
+	ino_a = key_inum(c, &node_a->key);
+	ino_b = key_inum(c, &node_b->key);
+	if (ino_a != ino_b)
+		return ino_a < ino_b ? -1 : 1;
+
+	/* Same inode - the block number decides, a tie yields -1 */
+	blk_a = key_block(c, &node_a->key);
+	blk_b = key_block(c, &node_b->key);
+	return blk_a <= blk_b ? -1 : 1;
+}
+
+/**
+ * nondata_nodes_cmp - compare 2 non-data nodes.
+ * @priv: UBIFS file-system description object
+ * @a: first node
+ * @b: second node
+ *
+ * This function compares nodes @a and @b. It makes sure that inode nodes go
+ * first and sorted by length in descending order. Directory entry nodes go
+ * after inode nodes and are sorted in ascending hash value order.
+ */
+int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct ubifs_info *c = priv;
+	struct ubifs_scan_node *node_a, *node_b;
+	int type_a, type_b;
+	ino_t ino_a, ino_b;
+	uint32_t hash_a, hash_b;
+
+	cond_resched();
+	node_a = list_entry(a, struct ubifs_scan_node, list);
+	node_b = list_entry(b, struct ubifs_scan_node, list);
+	type_a = key_type(c, &node_a->key);
+	type_b = key_type(c, &node_b->key);
+	ubifs_assert(type_a != UBIFS_DATA_KEY && type_b != UBIFS_DATA_KEY);
+
+	/* Two inode nodes: the longer one goes first */
+	if (type_a == UBIFS_INO_KEY && type_b == UBIFS_INO_KEY)
+		return node_b->len - node_a->len;
+	/* An inode node always goes before a directory entry node */
+	if (type_a == UBIFS_INO_KEY)
+		return -1;
+	if (type_b == UBIFS_INO_KEY)
+		return 1;
+
+	ubifs_assert(type_a == UBIFS_DENT_KEY && type_b == UBIFS_DENT_KEY);
+
+	/* Directory entries: the inode number is compared first */
+	ino_a = key_inum(c, &node_a->key);
+	ino_b = key_inum(c, &node_b->key);
+	if (ino_a != ino_b)
+		return ino_a < ino_b ? -1 : 1;
+
+	/* Same inode number - the hash decides, a tie yields -1 */
+	hash_a = key_hash(c, &node_a->key);
+	hash_b = key_hash(c, &node_b->key);
+	return hash_a <= hash_b ? -1 : 1;
+}
+
+/**
+ * sort_nodes - sort nodes for GC.
+ * @c: UBIFS file-system description object
+ * @sleb: describes nodes to sort and contains the result on exit
+ * @nondata: contains non-data nodes on exit
+ * @min: minimum node size is returned here
+ *
+ * This function sorts the list of inodes to garbage collect. First of all, it
+ * kills obsolete nodes and separates data and non-data nodes to the
+ * @sleb->nodes and @nondata lists correspondingly.
+ *
+ * Data nodes are then sorted in block number order - this is important for
+ * bulk-read; data nodes with lower inode number go before data nodes with
+ * higher inode number, and data nodes with lower block number go before data
+ * nodes with higher block number;
+ *
+ * Non-data nodes are sorted as follows.
+ *   o First go inode nodes - they are sorted in descending length order.
+ *   o Then go directory entry nodes - they are sorted in hash order, which
+ *     should supposedly optimize 'readdir()'.
+ *     Direntry nodes with lower parent
+ *     inode number go before direntry nodes with higher parent inode number,
+ *     and direntry nodes with lower name hash values go before direntry nodes
+ *     with higher name hash values.
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		      struct list_head *nondata, int *min)
+{
+	struct ubifs_scan_node *snod, *tmp;
+
+	*min = INT_MAX;
+
+	/* Separate data nodes and non-data nodes */
+	list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+		int err;
+
+		ubifs_assert(snod->type != UBIFS_IDX_NODE);
+		ubifs_assert(snod->type != UBIFS_REF_NODE);
+		ubifs_assert(snod->type != UBIFS_CS_NODE);
+
+		/* Negative means error, zero means the node is obsolete */
+		err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
+					 snod->offs, 0);
+		if (err < 0)
+			return err;
+
+		if (!err) {
+			/* The node is obsolete, remove it from the list */
+			list_del(&snod->list);
+			kfree(snod);
+			continue;
+		}
+
+		/* Keep track of the smallest node which is not obsolete */
+		if (snod->len < *min)
+			*min = snod->len;
+
+		if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
+			list_move_tail(&snod->list, nondata);
+	}
+
+	/* Sort data and non-data nodes */
+	list_sort(c, &sleb->nodes, &data_nodes_cmp);
+	list_sort(c, nondata, &nondata_nodes_cmp);
+	return 0;
+}
+
+/**
+ * move_node - move a node.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ * @snod: the node to move
+ * @wbuf: write-buffer to move node to
+ *
+ * This function moves node @snod to @wbuf, changes TNC correspondingly, and
+ * destroys @snod. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
+		     struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
+{
+	/* The new position is sampled before the write changes @wbuf */
+	int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
+
+	cond_resched();
+	/* If the write fails, @snod is left on its list */
+	err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
+	if (err)
+		return err;
+
+	err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
+				snod->offs, new_lnum, new_offs,
+				snod->len);
+	/* Once written, @snod is destroyed whatever the TNC update returned */
+	list_del(&snod->list);
+	kfree(snod);
+	return err;
+}
+
+/**
+ * move_nodes - move nodes.
+ * @c: UBIFS file-system description object
+ * @sleb: describes the LEB to move nodes from
+ *
+ * This function moves valid nodes from data LEB described by @sleb to the GC
+ * journal head. This function returns zero in case of success, %-EAGAIN if
+ * commit is required, and other negative error codes in case of other
+ * failures.
+ */
+static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
+{
+	int err, min;
+	LIST_HEAD(nondata);
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+
+	if (wbuf->lnum == -1) {
+		/*
+		 * The GC journal head is not set, because it is the first GC
+		 * invocation since mount.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			return err;
+	}
+
+	/* On return @min is the length of the smallest node left to move */
+	err = sort_nodes(c, sleb, &nondata, &min);
+	if (err)
+		goto out;
+
+	/* Write nodes to their new location. Use the first-fit strategy */
+	while (1) {
+		int avail;
+		struct ubifs_scan_node *snod, *tmp;
+
+		/* Move data nodes */
+		list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			if (snod->len > avail)
+				/*
+				 * Do not skip data nodes in order to optimize
+				 * bulk-read.
+				 */
+				break;
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
+
+		/* Move non-data nodes */
+		list_for_each_entry_safe(snod, tmp, &nondata, list) {
+			avail = c->leb_size - wbuf->offs - wbuf->used;
+			/* Not even the smallest node fits any more */
+			if (avail < min)
+				break;
+
+			if (snod->len > avail) {
+				/*
+				 * Keep going only if this is an inode with
+				 * some data. Otherwise stop and switch the GC
+				 * head. IOW, we assume that data-less inode
+				 * nodes and direntry nodes are roughly of the
+				 * same size.
+				 */
+				if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
+				    snod->len == UBIFS_INO_NODE_SZ)
+					break;
+				continue;
+			}
+
+			err = move_node(c, sleb, snod, wbuf);
+			if (err)
+				goto out;
+		}
+
+		/* Done once both lists have been emptied */
+		if (list_empty(&sleb->nodes) && list_empty(&nondata))
+			break;
+
+		/*
+		 * Waste the rest of the space in the LEB and switch to the
+		 * next LEB.
+		 */
+		err = switch_gc_head(c);
+		if (err)
+			goto out;
+	}
+
+	return 0;
+
+out:
+	/* Hand the non-data nodes which were not moved back to @sleb */
+	list_splice_tail(&nondata, &sleb->nodes);
+	return err;
+}
+
+/**
+ * gc_sync_wbufs - sync write-buffers for GC.
+ * @c: UBIFS file-system description object
+ *
+ * We must guarantee that obsoleting nodes are on flash. Unfortunately they may
+ * be in a write-buffer instead. That is, a node could be written to a
+ * write-buffer, obsoleting another node in a LEB that is GC'd. If that LEB is
+ * erased before the write-buffer is sync'd and then there is an unclean
+ * unmount, then an existing node is lost. To avoid this, we sync all
+ * write-buffers.
+ *
+ * This function returns %0 on success or a negative error code on failure.
+ */
+static int gc_sync_wbufs(struct ubifs_info *c)
+{
+	int err, i;
+
+	for (i = 0; i < c->jhead_cnt; i++) {
+		/* The write-buffer of the GC head itself is skipped */
+		if (i == GCHD)
+			continue;
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+/**
+ * ubifs_garbage_collect_leb - garbage-collect a logical eraseblock.
+ * @c: UBIFS file-system description object
+ * @lp: describes the LEB to garbage collect
+ *
+ * This function garbage-collects an LEB and returns one of the @LEB_FREED,
+ * @LEB_RETAINED, etc positive codes in case of success, %-EAGAIN if commit is
+ * required, and other negative error codes in case of failures.
+ */
+int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
+	int err = 0, lnum = lp->lnum;
+
+	ubifs_assert(c->gc_lnum != -1 || wbuf->offs + wbuf->used == 0 ||
+		     c->need_recovery);
+	ubifs_assert(c->gc_lnum != lnum);
+	ubifs_assert(wbuf->lnum != lnum);
+
+	/*
+	 * We scan the entire LEB even though we only really need to scan up to
+	 * (c->leb_size - lp->free).
+	 */
+	sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+
+	ubifs_assert(!list_empty(&sleb->nodes));
+	/* The type of the first node tells index LEBs from data LEBs */
+	snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
+
+	if (snod->type == UBIFS_IDX_NODE) {
+		struct ubifs_gced_idx_leb *idx_gc;
+
+		dbg_gc("indexing LEB %d (free %d, dirty %d)",
+		       lnum, lp->free, lp->dirty);
+		list_for_each_entry(snod, &sleb->nodes, list) {
+			struct ubifs_idx_node *idx = snod->node;
+			int level = le16_to_cpu(idx->level);
+
+			ubifs_assert(snod->type == UBIFS_IDX_NODE);
+			key_read(c, ubifs_idx_key(c, idx), &snod->key);
+			err = ubifs_dirty_idx_node(c, &snod->key, level, lnum,
+						   snod->offs);
+			if (err)
+				goto out;
+		}
+
+		idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS);
+		if (!idx_gc) {
+			err = -ENOMEM;
+			goto out;
+		}
+
+		idx_gc->lnum = lnum;
+		idx_gc->unmap = 0;	/* set to 1 by ubifs_gc_start_commit() */
+		list_add(&idx_gc->list, &c->idx_gc);
+
+		/*
+		 * Don't release the LEB until after the next commit, because
+		 * it may contain data which is needed for recovery. So
+		 * although we freed this LEB, it will become usable only after
+		 * the commit.
+		 */
+		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0,
+					  LPROPS_INDEX, 1);
+		if (err)
+			goto out;
+		err = LEB_FREED_IDX;
+	} else {
+		dbg_gc("data LEB %d (free %d, dirty %d)",
+		       lnum, lp->free, lp->dirty);
+
+		err = move_nodes(c, sleb);
+		if (err)
+			goto out_inc_seq;
+
+		err = gc_sync_wbufs(c);
+		if (err)
+			goto out_inc_seq;
+
+		err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0);
+		if (err)
+			goto out_inc_seq;
+
+		/*
+		 * Allow for races with TNC: the LEB number is stored first and
+		 * the GC sequence counter is incremented afterwards, with
+		 * write barriers after each of the two stores.
+		 */
+		c->gced_lnum = lnum;
+		smp_wmb();
+		c->gc_seq += 1;
+		smp_wmb();
+
+		if (c->gc_lnum == -1) {
+			/* No GC LEB is set: keep this one instead of unmapping it */
+			c->gc_lnum = lnum;
+			err = LEB_RETAINED;
+		} else {
+			err = ubifs_wbuf_sync_nolock(wbuf);
+			if (err)
+				goto out;
+
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				goto out;
+
+			err = LEB_FREED;
+		}
+	}
+
+out:
+	ubifs_scan_destroy(sleb);	/* reached on success and on failure */
+	return err;
+
+out_inc_seq:
+	/* We may have moved at least some nodes so allow for races with TNC */
+	c->gced_lnum = lnum;
+	smp_wmb();
+	c->gc_seq += 1;
+	smp_wmb();
+	goto out;
+}
+
+/**
+ * ubifs_garbage_collect - UBIFS garbage collector.
+ * @c: UBIFS file-system description object
+ * @anyway: do GC even if there are free LEBs
+ *
+ * This function does out-of-place garbage collection. The return codes are:
+ * o positive LEB number if the LEB has been freed and may be used;
+ * o %-EAGAIN if the caller has to run commit;
+ * o %-ENOSPC if GC failed to make any progress;
+ * o other negative error codes in case of other errors.
+ *
+ * Garbage collector writes data to the journal when GC'ing data LEBs, and just
+ * marking indexing nodes dirty when GC'ing indexing LEBs. Thus, at some point
+ * commit may be required. But commit cannot be run from inside GC, because the
+ * caller might be holding the commit lock, so %-EAGAIN is returned instead;
+ * And this error code means that the caller has to run commit, and re-run GC
+ * if there is still no free space.
+ * + * There are many reasons why this function may return %-EAGAIN: + * o the log is full and there is no space to write an LEB reference for + * @c->gc_lnum; + * o the journal is too large and exceeds size limitations; + * o GC moved indexing LEBs, but they can be used only after the commit; + * o the shrinker fails to find clean znodes to free and requests the commit; + * o etc. + * + * Note, if the file-system is close to be full, this function may return + * %-EAGAIN infinitely, so the caller has to limit amount of re-invocations of + * the function. E.g., this happens if the limits on the journal size are too + * tough and GC writes too much to the journal before an LEB is freed. This + * might also mean that the journal is too large, and the TNC becomes to big, + * so that the shrinker is constantly called, finds not clean znodes to free, + * and requests commit. Well, this may also happen if the journal is all right, + * but another kernel process consumes too much memory. Anyway, infinite + * %-EAGAIN may happen, but in some extreme/misconfiguration cases. + */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway) +{ + int i, err, ret, min_space = c->dead_wm; + struct ubifs_lprops lp; + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + + ubifs_assert_cmt_locked(c); + + if (ubifs_gc_should_commit(c)) + return -EAGAIN; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + ret = -EROFS; + goto out_unlock; + } + + /* We expect the write-buffer to be empty on entry */ + ubifs_assert(!wbuf->used); + + for (i = 0; ; i++) { + int space_before = c->leb_size - wbuf->offs - wbuf->used; + int space_after; + + cond_resched(); + + /* Give the commit an opportunity to run */ + if (ubifs_gc_should_commit(c)) { + ret = -EAGAIN; + break; + } + + if (i > SOFT_LEBS_LIMIT && !list_empty(&c->idx_gc)) { + /* + * We've done enough iterations. Indexing LEBs were + * moved and will be available after the commit. 
+ */ + dbg_gc("soft limit, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + break; + } + + if (i > HARD_LEBS_LIMIT) { + /* + * We've moved too many LEBs and have not made + * progress, give up. + */ + dbg_gc("hard limit, -ENOSPC"); + ret = -ENOSPC; + break; + } + + /* + * Empty and freeable LEBs can turn up while we waited for + * the wbuf lock, or while we have been running GC. In that + * case, we should just return one of those instead of + * continuing to GC dirty LEBs. Hence we request + * 'ubifs_find_dirty_leb()' to return an empty LEB if it can. + */ + ret = ubifs_find_dirty_leb(c, &lp, min_space, anyway ? 0 : 1); + if (ret) { + if (ret == -ENOSPC) + dbg_gc("no more dirty LEBs"); + break; + } + + dbg_gc("found LEB %d: free %d, dirty %d, sum %d " + "(min. space %d)", lp.lnum, lp.free, lp.dirty, + lp.free + lp.dirty, min_space); + + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned */ + dbg_gc("LEB %d is free, return it", lp.lnum); + /* + * ubifs_find_dirty_leb() doesn't return freeable index + * LEBs. + */ + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + if (lp.free != c->leb_size) { + /* + * Write buffers must be sync'd before + * unmapping freeable LEBs, because one of them + * may contain data which obsoletes something + * in 'lp.pnum'. + */ + ret = gc_sync_wbufs(c); + if (ret) + goto out; + ret = ubifs_change_one_lp(c, lp.lnum, + c->leb_size, 0, 0, 0, + 0); + if (ret) + goto out; + } + ret = ubifs_leb_unmap(c, lp.lnum); + if (ret) + goto out; + ret = lp.lnum; + break; + } + + space_before = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum == -1) + space_before = 0; + + ret = ubifs_garbage_collect_leb(c, &lp); + if (ret < 0) { + if (ret == -EAGAIN || ret == -ENOSPC) { + /* + * These codes are not errors, so we have to + * return the LEB to lprops. 
But if the + * 'ubifs_return_leb()' function fails, its + * failure code is propagated to the caller + * instead of the original '-EAGAIN' or + * '-ENOSPC'. + */ + err = ubifs_return_leb(c, lp.lnum); + if (err) + ret = err; + break; + } + goto out; + } + + if (ret == LEB_FREED) { + /* An LEB has been freed and is ready for use */ + dbg_gc("LEB %d freed, return", lp.lnum); + ret = lp.lnum; + break; + } + + if (ret == LEB_FREED_IDX) { + /* + * This was an indexing LEB and it cannot be + * immediately used. And instead of requesting the + * commit straight away, we try to garbage collect some + * more. + */ + dbg_gc("indexing LEB %d freed, continue", lp.lnum); + continue; + } + + ubifs_assert(ret == LEB_RETAINED); + space_after = c->leb_size - wbuf->offs - wbuf->used; + dbg_gc("LEB %d retained, freed %d bytes", lp.lnum, + space_after - space_before); + + if (space_after > space_before) { + /* GC makes progress, keep working */ + min_space >>= 1; + if (min_space < c->dead_wm) + min_space = c->dead_wm; + continue; + } + + dbg_gc("did not make progress"); + + /* + * GC moved an LEB bud have not done any progress. This means + * that the previous GC head LEB contained too few free space + * and the LEB which was GC'ed contained only large nodes which + * did not fit that space. + * + * We can do 2 things: + * 1. pick another LEB in a hope it'll contain a small node + * which will fit the space we have at the end of current GC + * head LEB, but there is no guarantee, so we try this out + * unless we have already been working for too long; + * 2. request an LEB with more dirty space, which will force + * 'ubifs_find_dirty_leb()' to start scanning the lprops + * table, instead of just picking one from the heap + * (previously it already picked the dirtiest LEB). + */ + if (i < SOFT_LEBS_LIMIT) { + dbg_gc("try again"); + continue; + } + + min_space <<= 1; + if (min_space > c->dark_wm) + min_space = c->dark_wm; + dbg_gc("set min. 
space to %d", min_space); + } + + if (ret == -ENOSPC && !list_empty(&c->idx_gc)) { + dbg_gc("no space, some index LEBs GC'ed, -EAGAIN"); + ubifs_commit_required(c); + ret = -EAGAIN; + } + + err = ubifs_wbuf_sync_nolock(wbuf); + if (!err) + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) { + ret = err; + goto out; + } +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return ret; + +out: + ubifs_assert(ret < 0); + ubifs_assert(ret != -ENOSPC && ret != -EAGAIN); + ubifs_ro_mode(c, ret); + ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + ubifs_return_leb(c, lp.lnum); + return ret; +} + +/** + * ubifs_gc_start_commit - garbage collection at start of commit. + * @c: UBIFS file-system description object + * + * If a LEB has only dirty and free space, then we may safely unmap it and make + * it free. Note, we cannot do this with indexing LEBs because dirty space may + * correspond index nodes that are required for recovery. In that case, the + * LEB cannot be unmapped until after the next commit. + * + * This function returns %0 upon success and a negative error code upon failure. + */ +int ubifs_gc_start_commit(struct ubifs_info *c) +{ + struct ubifs_gced_idx_leb *idx_gc; + const struct ubifs_lprops *lp; + int err = 0, flags; + + ubifs_get_lprops(c); + + /* + * Unmap (non-index) freeable LEBs. Note that recovery requires that all + * wbufs are sync'd before this, which is done in 'do_commit()'. 
+ */ + while (1) { + lp = ubifs_fast_find_freeable(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + err = ubifs_leb_unmap(c, lp->lnum); + if (err) + goto out; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, lp->flags, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + } + + /* Mark GC'd index LEBs OK to unmap after this commit finishes */ + list_for_each_entry(idx_gc, &c->idx_gc, list) + idx_gc->unmap = 1; + + /* Record index freeable LEBs for unmapping after commit */ + while (1) { + lp = ubifs_fast_find_frdi_idx(c); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + if (!lp) + break; + idx_gc = kmalloc(sizeof(struct ubifs_gced_idx_leb), GFP_NOFS); + if (!idx_gc) { + err = -ENOMEM; + goto out; + } + ubifs_assert(!(lp->flags & LPROPS_TAKEN)); + ubifs_assert(lp->flags & LPROPS_INDEX); + /* Don't release the LEB until after the next commit */ + flags = (lp->flags | LPROPS_TAKEN) ^ LPROPS_INDEX; + lp = ubifs_change_lp(c, lp, c->leb_size, 0, flags, 1); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + kfree(idx_gc); + goto out; + } + ubifs_assert(lp->flags & LPROPS_TAKEN); + ubifs_assert(!(lp->flags & LPROPS_INDEX)); + idx_gc->lnum = lp->lnum; + idx_gc->unmap = 1; + list_add(&idx_gc->list, &c->idx_gc); + } +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_gc_end_commit - garbage collection at end of commit. + * @c: UBIFS file-system description object + * + * This function completes out-of-place garbage collection of index LEBs. 
+ */
+int ubifs_gc_end_commit(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc, *tmp;
+	struct ubifs_wbuf *wbuf;
+	int err = 0;
+
+	wbuf = &c->jheads[GCHD].wbuf;
+	mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+	list_for_each_entry_safe(idx_gc, tmp, &c->idx_gc, list)
+		if (idx_gc->unmap) {	/* other entries are left on the list */
+			dbg_gc("LEB %d", idx_gc->lnum);
+			err = ubifs_leb_unmap(c, idx_gc->lnum);
+			if (err)
+				goto out;
+			err = ubifs_change_one_lp(c, idx_gc->lnum, LPROPS_NC,
+					  LPROPS_NC, 0, LPROPS_TAKEN, -1);
+			if (err)
+				goto out;	/* the entry stays on the list */
+			list_del(&idx_gc->list);
+			kfree(idx_gc);
+		}
+out:
+	mutex_unlock(&wbuf->io_mutex);
+	return err;
+}
+
+/**
+ * ubifs_destroy_idx_gc - destroy idx_gc list.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the @c->idx_gc list. It is called when unmounting
+ * so locks are not needed. Every entry is unlinked and freed, and
+ * @c->idx_gc_cnt is decremented once per entry. This function does not
+ * return a value.
+ */
+void ubifs_destroy_idx_gc(struct ubifs_info *c)
+{
+	while (!list_empty(&c->idx_gc)) {
+		struct ubifs_gced_idx_leb *idx_gc;
+
+		idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb,
+				    list);
+		c->idx_gc_cnt -= 1;
+		list_del(&idx_gc->list);
+		kfree(idx_gc);
+	}
+}
+
+/**
+ * ubifs_get_idx_gc_leb - get a LEB from GC'd index LEB list.
+ * @c: UBIFS file-system description object
+ *
+ * Called during start commit so locks are not needed.
+ */
+int ubifs_get_idx_gc_leb(struct ubifs_info *c)
+{
+	struct ubifs_gced_idx_leb *idx_gc;
+	int lnum;
+
+	if (list_empty(&c->idx_gc))
+		return -ENOSPC;	/* no GC'd index LEB is queued */
+	/* Take the first entry of the list */
+	idx_gc = list_entry(c->idx_gc.next, struct ubifs_gced_idx_leb, list);
+	lnum = idx_gc->lnum;
+	/*
+	 * c->idx_gc_cnt is updated by the caller when lprops are updated. Only
+	 * the list entry is unlinked and freed here, the LEB number is what
+	 * gets returned.
+	 */
+	list_del(&idx_gc->list);
+	kfree(idx_gc);
+	return lnum;
+}
diff -Nurd linux-2.6.24/fs/ubifs/io.c ubifs-v2.6.24/fs/ubifs/io.c
--- linux-2.6.24/fs/ubifs/io.c	1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/io.c	2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,940 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ * Copyright (C) 2006, 2007 University of Szeged, Hungary
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ *          Zoltan Sogor
+ */
+
+/*
+ * This file implements UBIFS I/O subsystem which provides various I/O-related
+ * helper functions (reading/writing/checking/validating nodes) and implements
+ * write-buffering support. Write buffers help to save space which otherwise
+ * would have been wasted for padding to the nearest minimal I/O unit boundary.
+ * Instead, data first goes to the write-buffer and is flushed when the
+ * buffer is full or when it is not used for some time (by timer).
This is + * similar to the mechanism is used by JFFS2. + * + * Write-buffers are defined by 'struct ubifs_wbuf' objects and protected by + * mutexes defined inside these objects. Since sometimes upper-level code + * has to lock the write-buffer (e.g. journal space reservation code), many + * functions related to write-buffers have "nolock" suffix which means that the + * caller has to lock the write-buffer before calling this function. + * + * UBIFS stores nodes at 64 bit-aligned addresses. If the node length is not + * aligned, UBIFS starts the next node from the aligned address, and the padded + * bytes may contain any rubbish. In other words, UBIFS does not put padding + * bytes in those small gaps. Common headers of nodes store real node lengths, + * not aligned lengths. Indexing nodes also store real lengths in branches. + * + * UBIFS uses padding when it pads to the next min. I/O unit. In this case it + * uses padding nodes or padding bytes, if the padding node does not fit. + * + * All UBIFS nodes are protected by CRC checksums and UBIFS checks all nodes + * every time they are read from the flash media. + */ + +#include +#include "ubifs.h" + +/** + * ubifs_ro_mode - switch UBIFS to read read-only mode. + * @c: UBIFS file-system description object + * @err: error code which is the reason of switching to R/O mode + */ +void ubifs_ro_mode(struct ubifs_info *c, int err) +{ + if (!c->ro_media) { + c->ro_media = 1; + c->no_chk_data_crc = 0; + ubifs_warn("switched to read-only mode, error %d", err); + dbg_dump_stack(); + } +} + +/** + * ubifs_check_node - check node. + * @c: UBIFS file-system description object + * @buf: node to check + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * @must_chk_crc: indicates whether to always check the CRC + * + * This function checks node magic number and CRC checksum. 
This function also + * validates node length to prevent UBIFS from becoming crazy when an attacker + * feeds it a file-system image with incorrect nodes. For example, too large + * node length in the common header could cause UBIFS to read memory outside of + * allocated buffer when checking the CRC checksum. + * + * This function may skip data nodes CRC checking if @c->no_chk_data_crc is + * true, which is controlled by corresponding UBIFS mount option. However, if + * @must_chk_crc is true, then @c->no_chk_data_crc is ignored and CRC is + * checked. Similarly, if @c->always_chk_crc is true, @c->no_chk_data_crc is + * ignored and CRC is checked. + * + * This function returns zero in case of success and %-EUCLEAN in case of bad + * CRC or magic. + */ +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc) +{ + int err = -EINVAL, type, node_len; + uint32_t crc, node_crc, magic; + const struct ubifs_ch *ch = buf; + + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + + magic = le32_to_cpu(ch->magic); + if (magic != UBIFS_NODE_MAGIC) { + if (!quiet) + ubifs_err("bad magic %#08x, expected %#08x", + magic, UBIFS_NODE_MAGIC); + err = -EUCLEAN; + goto out; + } + + type = ch->node_type; + if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { + if (!quiet) + ubifs_err("bad node type %d", type); + goto out; + } + + node_len = le32_to_cpu(ch->len); + if (node_len + offs > c->leb_size) + goto out_len; + + if (c->ranges[type].max_len == 0) { + if (node_len != c->ranges[type].len) + goto out_len; + } else if (node_len < c->ranges[type].min_len || + node_len > c->ranges[type].max_len) + goto out_len; + + if (!must_chk_crc && type == UBIFS_DATA_NODE && !c->always_chk_crc && + c->no_chk_data_crc) + return 0; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) { + if (!quiet) + ubifs_err("bad CRC: 
calculated %#08x, read %#08x", + crc, node_crc); + err = -EUCLEAN; + goto out; + } + + return 0; + +out_len: + if (!quiet) + ubifs_err("bad node length %d", node_len); +out: + if (!quiet) { + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + return err; +} + +/** + * ubifs_pad - pad flash space. + * @c: UBIFS file-system description object + * @buf: buffer to put padding to + * @pad: how many bytes to pad + * + * The flash media obliges us to write only in chunks of %c->min_io_size and + * when we have to write less data we add padding node to the write-buffer and + * pad it to the next minimal I/O unit's boundary. Padding nodes help when the + * media is being scanned. If the amount of wasted space is not enough to fit a + * padding node which takes %UBIFS_PAD_NODE_SZ bytes, we write padding bytes + * pattern (%UBIFS_PADDING_BYTE). + * + * Padding nodes are also used to fill gaps when the "commit-in-gaps" method is + * used. + */ +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad) +{ + uint32_t crc; + + ubifs_assert(pad >= 0 && !(pad & 7)); + + if (pad >= UBIFS_PAD_NODE_SZ) { + struct ubifs_ch *ch = buf; + struct ubifs_pad_node *pad_node = buf; + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->node_type = UBIFS_PAD_NODE; + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->padding[0] = ch->padding[1] = 0; + ch->sqnum = 0; + ch->len = cpu_to_le32(UBIFS_PAD_NODE_SZ); + pad -= UBIFS_PAD_NODE_SZ; + pad_node->pad_len = cpu_to_le32(pad); + crc = crc32(UBIFS_CRC32_INIT, buf + 8, UBIFS_PAD_NODE_SZ - 8); + ch->crc = cpu_to_le32(crc); + memset(buf + UBIFS_PAD_NODE_SZ, 0, pad); + } else if (pad > 0) + /* Too little space, padding node won't fit */ + memset(buf, UBIFS_PADDING_BYTE, pad); +} + +/** + * next_sqnum - get next sequence number. 
+ * @c: UBIFS file-system description object + */ +static unsigned long long next_sqnum(struct ubifs_info *c) +{ + unsigned long long sqnum; + + spin_lock(&c->cnt_lock); + sqnum = ++c->max_sqnum; + spin_unlock(&c->cnt_lock); + + if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { + if (sqnum >= SQNUM_WATERMARK) { + ubifs_err("sequence number overflow %llu, end of life", + sqnum); + ubifs_ro_mode(c, -EINVAL); + } + ubifs_warn("running out of sequence numbers, end of life soon"); + } + + return sqnum; +} + +/** + * ubifs_prepare_node - prepare node to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @pad: if the buffer has to be padded + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC, fills the common header, and adds proper padding up to + * the next minimum I/O unit if @pad is not zero. + */ +void ubifs_prepare_node(struct ubifs_info *c, void *node, int len, int pad) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + ch->group_type = UBIFS_NO_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); + + if (pad) { + len = ALIGN(len, 8); + pad = ALIGN(len, c->min_io_size) - len; + ubifs_pad(c, node + len, pad); + } +} + +/** + * ubifs_prep_grp_node - prepare node of a group to be written to flash. + * @c: UBIFS file-system description object + * @node: the node to pad + * @len: node length + * @last: indicates the last node of the group + * + * This function prepares node at @node to be written to the media - it + * calculates node CRC and fills the common header. 
+ */ +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) +{ + uint32_t crc; + struct ubifs_ch *ch = node; + unsigned long long sqnum = next_sqnum(c); + + ubifs_assert(len >= UBIFS_CH_SZ); + + ch->magic = cpu_to_le32(UBIFS_NODE_MAGIC); + ch->len = cpu_to_le32(len); + if (last) + ch->group_type = UBIFS_LAST_OF_NODE_GROUP; + else + ch->group_type = UBIFS_IN_NODE_GROUP; + ch->sqnum = cpu_to_le64(sqnum); + ch->padding[0] = ch->padding[1] = 0; + crc = crc32(UBIFS_CRC32_INIT, node + 8, len - 8); + ch->crc = cpu_to_le32(crc); +} + +/** + * wbuf_timer_callback - write-buffer timer callback function. + * @data: timer data (write-buffer descriptor) + * + * This function is called when the write-buffer timer expires. + */ +static void wbuf_timer_callback_nolock(unsigned long data) +{ + struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + + wbuf->need_sync = 1; + wbuf->c->need_wbuf_sync = 1; + ubifs_wake_up_bgt(wbuf->c); +} + +/** + * new_wbuf_timer - start new write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + ubifs_assert(!timer_pending(&wbuf->timer)); + + if (!wbuf->timeout) + return; + + wbuf->timer.expires = jiffies + wbuf->timeout; + add_timer(&wbuf->timer); +} + +/** + * cancel_wbuf_timer - cancel write-buffer timer. + * @wbuf: write-buffer descriptor + */ +static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) +{ + /* + * If the syncer is waiting for the lock (from the background thread's + * context) and another task is changing write-buffer then the syncing + * should be canceled. + */ + wbuf->need_sync = 0; + del_timer(&wbuf->timer); +} + +/** + * ubifs_wbuf_sync_nolock - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This function synchronizes write-buffer @buf and returns zero in case of + * success or a negative error code in case of failure. 
+ */
+int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)
+{
+	struct ubifs_info *c = wbuf->c;
+	int err, dirt;
+
+	cancel_wbuf_timer_nolock(wbuf);
+	if (!wbuf->used || wbuf->lnum == -1)
+		/* Write-buffer is empty or not seeked */
+		return 0;
+
+	dbg_io("LEB %d:%d, %d bytes",
+	       wbuf->lnum, wbuf->offs, wbuf->used);
+	ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));
+	ubifs_assert(!(wbuf->avail & 7));
+	ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);
+
+	if (c->ro_media)
+		return -EROFS;	/* buffered data is not written in R/O mode */
+
+	/* Pad the unused tail and write one whole min. I/O unit */
+	ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);
+	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+			    c->min_io_size, wbuf->dtype);
+	if (err) {
+		ubifs_err("cannot write %d bytes to LEB %d:%d",
+			  c->min_io_size, wbuf->lnum, wbuf->offs);
+		dbg_dump_stack();
+		return err;	/* the write-buffer state is left unchanged */
+	}
+
+	dirt = wbuf->avail;	/* number of bytes which were padded above */
+
+	spin_lock(&wbuf->lock);
+	wbuf->offs += c->min_io_size;
+	wbuf->avail = c->min_io_size;
+	wbuf->used = 0;
+	wbuf->next_ino = 0;
+	spin_unlock(&wbuf->lock);
+
+	/* 'err' is zero here, so without a callback zero is returned */
+	if (wbuf->sync_callback)
+		err = wbuf->sync_callback(c, wbuf->lnum,
+					  c->leb_size - wbuf->offs, dirt);
+	return err;
+}
+
+/**
+ * ubifs_wbuf_seek_nolock - seek write-buffer.
+ * @wbuf: write-buffer
+ * @lnum: logical eraseblock number to seek to
+ * @offs: logical eraseblock offset to seek to
+ * @dtype: data type
+ *
+ * This function targets the write buffer to logical eraseblock @lnum:@offs.
+ * The write-buffer is synchronized if it is not empty. Returns zero in case of
+ * success and a negative error code in case of failure.
+ */
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
+			   int dtype)
+{
+	const struct ubifs_info *c = wbuf->c;
+
+	dbg_io("LEB %d:%d", lnum, offs);
+	ubifs_assert(lnum >= 0 && lnum < c->leb_cnt);
+	ubifs_assert(offs >= 0 && offs <= c->leb_size);
+	ubifs_assert(offs % c->min_io_size == 0 && !(offs & 7));
+	ubifs_assert(lnum != wbuf->lnum);
+
+	if (wbuf->used > 0) {
+		int err = ubifs_wbuf_sync_nolock(wbuf);
+
+		if (err)
+			return err;	/* the old position is kept */
+	}
+
+	spin_lock(&wbuf->lock);
+	wbuf->lnum = lnum;
+	wbuf->offs = offs;
+	wbuf->avail = c->min_io_size;
+	wbuf->used = 0;
+	spin_unlock(&wbuf->lock);
+	wbuf->dtype = dtype;
+
+	return 0;
+}
+
+/**
+ * ubifs_bg_wbufs_sync - synchronize write-buffers.
+ * @c: UBIFS file-system description object
+ *
+ * This function is called by background thread to synchronize write-buffers.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ *
+ * Nothing is done unless @c->need_wbuf_sync is set, and only write-buffers
+ * with the 'need_sync' flag are synchronized. If the media is read-only,
+ * %-EROFS is returned. If a write-buffer cannot be synchronized, UBIFS is
+ * switched to read-only mode. In both failure cases the timers of all
+ * write-buffers are cancelled.
+ */
+int ubifs_bg_wbufs_sync(struct ubifs_info *c)
+{
+	int err, i;
+
+	if (!c->need_wbuf_sync)
+		return 0;
+	c->need_wbuf_sync = 0;
+
+	if (c->ro_media) {
+		err = -EROFS;
+		goto out_timers;
+	}
+
+	dbg_io("synchronize");
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+		cond_resched();
+
+		/*
+		 * If the mutex is locked then wbuf is being changed, so
+		 * synchronization is not necessary.
+		 */
+		if (mutex_is_locked(&wbuf->io_mutex))
+			continue;
+
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		if (!wbuf->need_sync) {	/* re-checked with the mutex held */
+			mutex_unlock(&wbuf->io_mutex);
+			continue;
+		}
+
+		err = ubifs_wbuf_sync_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+		if (err) {
+			ubifs_err("cannot sync write-buffer, error %d", err);
+			ubifs_ro_mode(c, err);
+			goto out_timers;
+		}
+	}
+
+	return 0;
+
+out_timers:
+	/* Cancel all timers to prevent repeated errors */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;
+
+		mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);
+		cancel_wbuf_timer_nolock(wbuf);
+		mutex_unlock(&wbuf->io_mutex);
+	}
+	return err;
+}
+
+/**
+ * ubifs_wbuf_write_nolock - write data to flash via write-buffer.
+ * @wbuf: write-buffer
+ * @buf: node to write
+ * @len: node length
+ *
+ * This function writes data to flash via write-buffer @wbuf. This means that
+ * the last piece of the node won't reach the flash media immediately if it
+ * does not take whole minimal I/O unit. Instead, the node will sit in RAM
+ * until the write-buffer is synchronized (e.g., by timer).
+ *
+ * This function returns zero in case of success and a negative error code in
+ * case of failure. If the node cannot be written because there is no more
+ * space in this logical eraseblock, %-ENOSPC is returned.
+ */
+int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len)
+{
+	struct ubifs_info *c = wbuf->c;
+	int err, written, n, aligned_len = ALIGN(len, 8), offs;
+
+	dbg_io("%d bytes (%s) to wbuf at LEB %d:%d", len,
+	       dbg_ntype(((struct ubifs_ch *)buf)->node_type), wbuf->lnum,
+	       wbuf->offs + wbuf->used);
+	ubifs_assert(len > 0 && wbuf->lnum >= 0 && wbuf->lnum < c->leb_cnt);
+	ubifs_assert(wbuf->offs >= 0 && wbuf->offs % c->min_io_size == 0);
+	ubifs_assert(!(wbuf->offs & 7) && wbuf->offs <= c->leb_size);
+	ubifs_assert(wbuf->avail > 0 && wbuf->avail <= c->min_io_size);
+	ubifs_assert(mutex_is_locked(&wbuf->io_mutex));
+
+	if (c->leb_size - wbuf->offs - wbuf->used < aligned_len) {
+		err = -ENOSPC;	/* does not fit into the rest of this LEB */
+		goto out;
+	}
+
+	cancel_wbuf_timer_nolock(wbuf);
+
+	if (c->ro_media)
+		return -EROFS;
+
+	if (aligned_len <= wbuf->avail) {
+		/*
+		 * The node is not very large and fits entirely within
+		 * write-buffer.
+		 */
+		memcpy(wbuf->buf + wbuf->used, buf, len);
+
+		if (aligned_len == wbuf->avail) {
+			/* The write-buffer became full, write it out now */
+			dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum,
+				wbuf->offs);
+			err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf,
+					    wbuf->offs, c->min_io_size,
+					    wbuf->dtype);
+			if (err)
+				goto out;
+
+			spin_lock(&wbuf->lock);
+			wbuf->offs += c->min_io_size;
+			wbuf->avail = c->min_io_size;
+			wbuf->used = 0;
+			wbuf->next_ino = 0;
+			spin_unlock(&wbuf->lock);
+		} else {
+			spin_lock(&wbuf->lock);
+			wbuf->avail -= aligned_len;
+			wbuf->used += aligned_len;
+			spin_unlock(&wbuf->lock);
+		}
+
+		goto exit;
+	}
+
+	/*
+	 * The node is large enough and does not fit entirely within current
+	 * minimal I/O unit. We have to fill and flush write-buffer and switch
+	 * to the next min. I/O unit.
+	 */
+	dbg_io("flush wbuf to LEB %d:%d", wbuf->lnum, wbuf->offs);
+	memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
+	err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,
+			    c->min_io_size, wbuf->dtype);
+	if (err)
+		goto out;
+
+	offs = wbuf->offs + c->min_io_size;
+	len -= wbuf->avail;	/* from now on: bytes still to be written */
+	aligned_len -= wbuf->avail;
+	written = wbuf->avail;	/* bytes of @buf already consumed */
+
+	/*
+	 * The remaining data may take more whole min. I/O units, so write the
+	 * remains multiple to min. I/O unit size directly to the flash media.
+	 * We align node length to 8-byte boundary because we anyway flash wbuf
+	 * if the remaining space is less than 8 bytes.
+	 */
+	n = aligned_len >> c->min_io_shift;
+	if (n) {
+		n <<= c->min_io_shift;	/* whole min. I/O units, in bytes */
+		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum, offs);
+		err = ubi_leb_write(c->ubi, wbuf->lnum, buf + written, offs, n,
+				    wbuf->dtype);
+		if (err)
+			goto out;
+		offs += n;
+		aligned_len -= n;
+		len -= n;
+		written += n;
+	}
+
+	spin_lock(&wbuf->lock);
+	if (aligned_len)
+		/*
+		 * And now we have what's left and what does not take whole
+		 * min. I/O unit, so write it to the write-buffer and we are
+		 * done.
+		 */
+		memcpy(wbuf->buf, buf + written, len);
+
+	wbuf->offs = offs;
+	wbuf->used = aligned_len;
+	wbuf->avail = c->min_io_size - aligned_len;
+	wbuf->next_ino = 0;
+	spin_unlock(&wbuf->lock);
+
+exit:
+	if (wbuf->sync_callback) {
+		int free = c->leb_size - wbuf->offs - wbuf->used;
+
+		err = wbuf->sync_callback(c, wbuf->lnum, free, 0);
+		if (err)
+			goto out;
+	}
+
+	if (wbuf->used)	/* something is buffered, (re)start the timer */
+		new_wbuf_timer_nolock(wbuf);
+
+	return 0;
+
+out:
+	/* NOTE(review): 'len' may have been reduced above by the bytes already written */
+	ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
+		  len, wbuf->lnum, wbuf->offs, err);
+	dbg_dump_node(c, buf);
+	dbg_dump_stack();
+	dbg_dump_leb(c, wbuf->lnum);
+	return err;
+}
+
+/**
+ * ubifs_write_node - write node to the media.
+ * @c: UBIFS file-system description object + * @buf: the node to write + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * + * This function automatically fills node magic number, assigns sequence + * number, and calculates node CRC checksum. The length of the @buf buffer has + * to be aligned to the minimal I/O unit size. This function automatically + * appends padding node and padding bytes if needed. Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int dtype) +{ + int err, buf_len = ALIGN(len, c->min_io_size); + + dbg_io("LEB %d:%d, %s, length %d (aligned %d)", + lnum, offs, dbg_ntype(((struct ubifs_ch *)buf)->node_type), len, + buf_len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(offs % c->min_io_size == 0 && offs < c->leb_size); + + if (c->ro_media) + return -EROFS; + + ubifs_prepare_node(c, buf, len, 1); + err = ubi_leb_write(c->ubi, lnum, buf, offs, buf_len, dtype); + if (err) { + ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + buf_len, lnum, offs, err); + dbg_dump_node(c, buf); + dbg_dump_stack(); + } + + return err; +} + +/** + * ubifs_read_node_wbuf - read node from the media or write-buffer. + * @wbuf: wbuf to check for un-written data + * @buf: buffer to read to + * @type: node type + * @len: node length + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and stores + * in @buf. If the node partially or fully sits in the write-buffer, this + * function takes data from the buffer, otherwise it reads the flash media. + * Returns zero in case of success, %-EUCLEAN if CRC mismatched and a negative + * error code in case of failure. 
+ */ +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs) +{ + const struct ubifs_info *c = wbuf->c; + int err, rlen, overlap; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* We may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubifs_read_node(c, buf, type, len, lnum, offs); + } + + /* Don't read under wbuf */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) { + /* Read everything that goes before write-buffer */ + err = ubi_read(c->ubi, lnum, buf, offs, rlen); + if (err && err != -EBADMSG) { + ubifs_err("failed to read node %d from LEB %d:%d, " + "error %d", type, lnum, offs, err); + dbg_dump_stack(); + return err; + } + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + rlen = le32_to_cpu(ch->len); + if (rlen != len) { + ubifs_err("bad node length %d, expected %d", rlen, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_read_node - read node. 
+ * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * + * This function reads a node of known type and length, checks it and + * stores in @buf. Returns zero in case of success, %-EUCLEAN if CRC mismatched + * and a negative error code in case of failure. + */ +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs) +{ + int err, l; + struct ubifs_ch *ch = buf; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + ubifs_assert(lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(len >= UBIFS_CH_SZ && offs + len <= c->leb_size); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(type >= 0 && type < UBIFS_NODE_TYPES_CNT); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) { + ubifs_err("cannot read node %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (type != ch->node_type) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, type); + goto out; + } + + err = ubifs_check_node(c, buf, lnum, offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", type); + return err; + } + + l = le32_to_cpu(ch->len); + if (l != len) { + ubifs_err("bad node length %d, expected %d", l, len); + goto out; + } + + return 0; + +out: + ubifs_err("bad node at LEB %d:%d", lnum, offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return -EINVAL; +} + +/** + * ubifs_wbuf_init - initialize write-buffer. + * @c: UBIFS file-system description object + * @wbuf: write-buffer to initialize + * + * This function initializes write buffer. Returns zero in case of success and + * %-ENOMEM in case of failure. 
+ */ +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) +{ + size_t size; + + wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); + if (!wbuf->buf) + return -ENOMEM; + + size = (c->min_io_size / UBIFS_CH_SZ + 1) * sizeof(ino_t); + wbuf->inodes = kmalloc(size, GFP_KERNEL); + if (!wbuf->inodes) { + kfree(wbuf->buf); + wbuf->buf = NULL; + return -ENOMEM; + } + + wbuf->used = 0; + wbuf->lnum = wbuf->offs = -1; + wbuf->avail = c->min_io_size; + wbuf->dtype = UBI_UNKNOWN; + wbuf->sync_callback = NULL; + mutex_init(&wbuf->io_mutex); + spin_lock_init(&wbuf->lock); + + wbuf->c = c; + init_timer(&wbuf->timer); + wbuf->timer.function = wbuf_timer_callback_nolock; + wbuf->timer.data = (unsigned long)wbuf; + wbuf->timeout = DEFAULT_WBUF_TIMEOUT; + wbuf->next_ino = 0; + + return 0; +} + +/** + * ubifs_wbuf_add_ino_nolock - add an inode number into the wbuf inode array. + * @wbuf: the write-buffer whereto add + * @inum: the inode number + * + * This function adds an inode number to the inode array of the write-buffer. + */ +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum) +{ + if (!wbuf->buf) + /* NOR flash or something similar */ + return; + + spin_lock(&wbuf->lock); + if (wbuf->used) + wbuf->inodes[wbuf->next_ino++] = inum; + spin_unlock(&wbuf->lock); +} + +/** + * wbuf_has_ino - returns if the wbuf contains data from the inode. + * @wbuf: the write-buffer + * @inum: the inode number + * + * This function returns with %1 if the write-buffer contains some data from the + * given inode otherwise it returns with %0. + */ +static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum) +{ + int i, ret = 0; + + spin_lock(&wbuf->lock); + for (i = 0; i < wbuf->next_ino; i++) + if (inum == wbuf->inodes[i]) { + ret = 1; + break; + } + spin_unlock(&wbuf->lock); + + return ret; +} + +/** + * ubifs_sync_wbufs_by_inode - synchronize write-buffers for an inode. 
+ * @c: UBIFS file-system description object + * @inode: inode to synchronize + * + * This function synchronizes write-buffers which contain nodes belonging to + * @inode. Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode) +{ + int i, err = 0; + + for (i = 0; i < c->jhead_cnt; i++) { + struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf; + + if (i == GCHD) + /* + * GC head is special, do not look at it. Even if the + * head contains something related to this inode, it is + * a _copy_ of corresponding on-flash node which sits + * somewhere else. + */ + continue; + + if (!wbuf_has_ino(wbuf, inode->i_ino)) + continue; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + if (wbuf_has_ino(wbuf, inode->i_ino)) + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + + if (err) { + ubifs_ro_mode(c, err); + return err; + } + } + return 0; +} diff -Nurd linux-2.6.24/fs/ubifs/ioctl.c ubifs-v2.6.24/fs/ubifs/ioctl.c --- linux-2.6.24/fs/ubifs/ioctl.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/ioctl.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,196 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * Copyright (C) 2006, 2007 University of Szeged, Hungary + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Zoltan Sogor + * Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements EXT2-compatible extended attribute ioctl() calls */ + +#include +#include +#include "ubifs.h" + +/** + * ubifs_set_inode_flags - set VFS inode flags. + * @inode: VFS inode to set flags for + * + * This function propagates flags from UBIFS inode object to VFS inode object. + */ +void ubifs_set_inode_flags(struct inode *inode) +{ + unsigned int flags = ubifs_inode(inode)->flags; + + inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC); + if (flags & UBIFS_SYNC_FL) + inode->i_flags |= S_SYNC; + if (flags & UBIFS_APPEND_FL) + inode->i_flags |= S_APPEND; + if (flags & UBIFS_IMMUTABLE_FL) + inode->i_flags |= S_IMMUTABLE; + if (flags & UBIFS_DIRSYNC_FL) + inode->i_flags |= S_DIRSYNC; +} + +/* + * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. + * @ioctl_flags: flags to convert + * + * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * (@UBIFS_COMPR_FL, etc). + */ +static int ioctl2ubifs(int ioctl_flags) +{ + int ubifs_flags = 0; + + if (ioctl_flags & FS_COMPR_FL) + ubifs_flags |= UBIFS_COMPR_FL; + if (ioctl_flags & FS_SYNC_FL) + ubifs_flags |= UBIFS_SYNC_FL; + if (ioctl_flags & FS_APPEND_FL) + ubifs_flags |= UBIFS_APPEND_FL; + if (ioctl_flags & FS_IMMUTABLE_FL) + ubifs_flags |= UBIFS_IMMUTABLE_FL; + if (ioctl_flags & FS_DIRSYNC_FL) + ubifs_flags |= UBIFS_DIRSYNC_FL; + + return ubifs_flags; +} + +/* + * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. + * @ubifs_flags: flags to convert + * + * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl + * flags (@FS_COMPR_FL, etc). 
+ */ +static int ubifs2ioctl(int ubifs_flags) +{ + int ioctl_flags = 0; + + if (ubifs_flags & UBIFS_COMPR_FL) + ioctl_flags |= FS_COMPR_FL; + if (ubifs_flags & UBIFS_SYNC_FL) + ioctl_flags |= FS_SYNC_FL; + if (ubifs_flags & UBIFS_APPEND_FL) + ioctl_flags |= FS_APPEND_FL; + if (ubifs_flags & UBIFS_IMMUTABLE_FL) + ioctl_flags |= FS_IMMUTABLE_FL; + if (ubifs_flags & UBIFS_DIRSYNC_FL) + ioctl_flags |= FS_DIRSYNC_FL; + + return ioctl_flags; +} + +static int setflags(struct inode *inode, int flags) +{ + int oldflags, err, release; + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_budget_req req = { .dirtied_ino = 1, + .dirtied_ino_d = ui->data_len }; + + err = ubifs_budget_space(c, &req); + if (err) + return err; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + */ + mutex_lock(&ui->ui_mutex); + oldflags = ubifs2ioctl(ui->flags); + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto out_unlock; + } + } + + ui->flags = ioctl2ubifs(flags); + ubifs_set_inode_flags(inode); + inode->i_ctime = ubifs_current_time(inode); + release = ui->dirty; + mark_inode_dirty_sync(inode); + mutex_unlock(&ui->ui_mutex); + + if (release) + ubifs_release_budget(c, &req); + if (IS_SYNC(inode)) + err = write_inode_now(inode, 1); + return err; + +out_unlock: + ubifs_err("can't modify inode %lu attributes", inode->i_ino); + mutex_unlock(&ui->ui_mutex); + ubifs_release_budget(c, &req); + return err; +} + +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int flags; + struct inode *inode = file->f_path.dentry->d_inode; + + switch (cmd) { + case FS_IOC_GETFLAGS: + flags = ubifs2ioctl(ubifs_inode(inode)->flags); + + dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags); + return put_user(flags, (int __user *) arg); + + case FS_IOC_SETFLAGS: { + if (IS_RDONLY(inode)) + return -EROFS; + + 
if (!is_owner_or_cap(inode)) + return -EACCES; + + if (get_user(flags, (int __user *) arg)) + return -EFAULT; + + if (!S_ISDIR(inode->i_mode)) + flags &= ~FS_DIRSYNC_FL; + + dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags); + return setflags(inode, flags); + } + + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return ubifs_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif diff -Nurd linux-2.6.24/fs/ubifs/journal.c ubifs-v2.6.24/fs/ubifs/journal.c --- linux-2.6.24/fs/ubifs/journal.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/journal.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,1442 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS journal. + * + * The journal consists of 2 parts - the log and bud LEBs. The log has fixed + * length and position, while a bud logical eraseblock is any LEB in the main + * area. 
Buds contain file system data - data nodes, inode nodes, etc. The log + * contains only references to buds and some other stuff like commit + * start node. The idea is that when we commit the journal, we do + * not copy the data, the buds just become indexed. Since after the commit the + * nodes in bud eraseblocks become leaf nodes of the file system index tree, we + * use the term "bud". The analogy is obvious: bud eraseblocks contain nodes + * which will become leaves in the future. + * + * The journal is multi-headed because we want to write data to the journal as + * optimally as possible. It is nice to have nodes belonging to the same inode + * in one LEB, so we may write data owned by different inodes to different + * journal heads, although at present only one data head is used. + * + * For recovery reasons, the base head contains all inode nodes, all directory + * entry nodes and all truncate nodes. This means that the other heads contain + * only data nodes. + * + * Bud LEBs may be half-indexed. For example, if the bud was not full at the + * time of commit, the bud is retained to continue to be used in the journal, + * even though the "front" of the LEB is now indexed. In that case, the log + * reference contains the offset where the bud starts for the purposes of the + * journal. + * + * The journal size has to be limited, because the larger the journal is, the + * longer it takes to mount UBIFS (scanning the journal) and the more memory it + * takes (indexing in the TNC). + * + * All the journal write operations like 'ubifs_jnl_update()' here, which write + * multiple UBIFS nodes to the journal in one go, are atomic with respect to + * unclean reboots. Should the unclean reboot happen, the recovery code drops + * all the nodes. + */ + +#include "ubifs.h" + +/** + * zero_ino_node_unused - zero out unused fields of an on-flash inode node. 
+ * @ino: the inode to zero out + */ +static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) +{ + memset(ino->padding1, 0, 4); + memset(ino->padding2, 0, 26); +} + +/** + * zero_dent_node_unused - zero out unused fields of an on-flash directory + * entry node. + * @dent: the directory entry to zero out + */ +static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) +{ + dent->padding1 = 0; + memset(dent->padding2, 0, 4); +} + +/** + * zero_data_node_unused - zero out unused fields of an on-flash data node. + * @data: the data node to zero out + */ +static inline void zero_data_node_unused(struct ubifs_data_node *data) +{ + memset(data->padding, 0, 2); +} + +/** + * zero_trun_node_unused - zero out unused fields of an on-flash truncation + * node. + * @trun: the truncation node to zero out + */ +static inline void zero_trun_node_unused(struct ubifs_trun_node *trun) +{ + memset(trun->padding, 0, 12); +} + +/** + * reserve_space - reserve space in the journal. + * @c: UBIFS file-system description object + * @jhead: journal head number + * @len: node length + * + * This function reserves space in journal head @jhead. If the reservation + * succeeded, the journal head stays locked and later has to be unlocked using + * 'release_head()'; 'write_node()' and 'write_head()' do not unlock it. + * Returns zero in case of success, %-EAGAIN if commit has to be done, and + * other negative error codes in case of other failures. + */ +static int reserve_space(struct ubifs_info *c, int jhead, int len) +{ + int err = 0, err1, retries = 0, avail, lnum, offs, squeeze; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + /* + * Typically, the base head has smaller nodes written to it, so it is + * better to try to allocate space at the ends of eraseblocks. This is + * what the squeeze parameter does. 
+ */ + squeeze = (jhead == BASEHD); +again: + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + + if (c->ro_media) { + err = -EROFS; + goto out_unlock; + } + + avail = c->leb_size - wbuf->offs - wbuf->used; + if (wbuf->lnum != -1 && avail >= len) + return 0; + + /* + * Write buffer wasn't seek'ed or there is not enough space - look for an + * LEB with some empty space. + */ + lnum = ubifs_find_free_space(c, len, &offs, squeeze); + if (lnum >= 0) { + /* Found an LEB, add it to the journal head */ + err = ubifs_add_bud_to_log(c, jhead, lnum, offs); + if (err) + goto out_return; + /* A new bud was successfully allocated and added to the log */ + goto out; + } + + err = lnum; + if (err != -ENOSPC) + goto out_unlock; + + /* + * No free space, we have to run garbage collector to make + * some. But the write-buffer mutex has to be unlocked because + * GC also takes it. + */ + dbg_jnl("no free space jhead %d, run GC", jhead); + mutex_unlock(&wbuf->io_mutex); + + lnum = ubifs_garbage_collect(c, 0); + if (lnum < 0) { + err = lnum; + if (err != -ENOSPC) + return err; + + /* + * GC could not make a free LEB. But someone else may + * have allocated new bud for this journal head, + * because we dropped @wbuf->io_mutex, so try once + * again. + */ + dbg_jnl("GC couldn't make a free LEB for jhead %d", jhead); + if (retries++ < 2) { + dbg_jnl("retry (%d)", retries); + goto again; + } + + dbg_jnl("return -ENOSPC"); + return err; + } + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + dbg_jnl("got LEB %d for jhead %d", lnum, jhead); + avail = c->leb_size - wbuf->offs - wbuf->used; + + if (wbuf->lnum != -1 && avail >= len) { + /* + * Someone else has switched the journal head and we have + * enough space now. This happens when more than one process is + * trying to write to the same journal head at the same time. 
+ */ + dbg_jnl("return LEB %d back, already have LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs + wbuf->used); + err = ubifs_return_leb(c, lnum); + if (err) + goto out_unlock; + return 0; + } + + err = ubifs_add_bud_to_log(c, jhead, lnum, 0); + if (err) + goto out_return; + offs = 0; + +out: + err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype); + if (err) + goto out_unlock; + + return 0; + +out_unlock: + mutex_unlock(&wbuf->io_mutex); + return err; + +out_return: + /* An error occurred and the LEB has to be returned to lprops */ + ubifs_assert(err < 0); + err1 = ubifs_return_leb(c, lnum); + if (err1 && err == -EAGAIN) + /* + * Return original error code only if it is not %-EAGAIN, + * which is not really an error. Otherwise, return the error + * code of 'ubifs_return_leb()'. + */ + err = err1; + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * write_node - write node to a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * @node: node to write + * @len: node length + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * + * This function writes a node to reserved space of journal head @jhead. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int write_node(struct ubifs_info *c, int jhead, void *node, int len, + int *lnum, int *offs) +{ + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + ubifs_prepare_node(c, node, len, 0); + + return ubifs_wbuf_write_nolock(wbuf, node, len); +} + +/** + * write_head - write data to a journal head. 
+ * @c: UBIFS file-system description object + * @jhead: journal head + * @buf: buffer to write + * @len: length to write + * @lnum: LEB number written is returned here + * @offs: offset written is returned here + * @sync: non-zero if the write-buffer has to be synchronized + * + * This function is the same as 'write_node()' but it does not assume the + * buffer it is writing is a node, so it does not prepare it (which means + * initializing common header and calculating CRC). + */ +static int write_head(struct ubifs_info *c, int jhead, void *buf, int len, + int *lnum, int *offs, int sync) +{ + int err; + struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; + + ubifs_assert(jhead != GCHD); + + *lnum = c->jheads[jhead].wbuf.lnum; + *offs = c->jheads[jhead].wbuf.offs + c->jheads[jhead].wbuf.used; + dbg_jnl("jhead %d, LEB %d:%d, len %d", jhead, *lnum, *offs, len); + + err = ubifs_wbuf_write_nolock(wbuf, buf, len); + if (err) + return err; + if (sync) + err = ubifs_wbuf_sync_nolock(wbuf); + return err; +} + +/** + * make_reservation - reserve journal space. + * @c: UBIFS file-system description object + * @jhead: journal head + * @len: how many bytes to reserve + * + * This function makes space reservation in journal head @jhead. The function + * takes the commit lock and locks the journal head, and the caller has to + * unlock the head and finish the reservation with 'finish_reservation()'. + * Returns zero in case of success and a negative error code in case of + * failure. + * + * Note, the journal head may be unlocked as soon as the data is written, while + * the commit lock has to be released after the data has been added to the + * TNC. + */ +static int make_reservation(struct ubifs_info *c, int jhead, int len) +{ + int err, cmt_retries = 0, nospc_retries = 0; + +again: + down_read(&c->commit_sem); + err = reserve_space(c, jhead, len); + if (!err) + return 0; + up_read(&c->commit_sem); + + if (err == -ENOSPC) { + /* + * GC could not make any progress. 
We should try to commit + * once because it could make some dirty space and GC would + * make progress, so make the error -EAGAIN so that the below + * will commit and re-try. + */ + if (nospc_retries++ < 2) { + dbg_jnl("no space, retry"); + err = -EAGAIN; + } + + /* + * This means that the budgeting is incorrect. We always have + * to be able to write to the media, because all operations are + * budgeted. Deletions are not budgeted, though, but we reserve + * an extra LEB for them. + */ + } + + if (err != -EAGAIN) + goto out; + + /* + * -EAGAIN means that the journal is full or too large, or the above + * code wants to do one commit. Do this and re-try. + */ + if (cmt_retries > 128) { + /* + * This should not happen unless the journal size limitations + * are too tough. + */ + ubifs_err("stuck in space allocation"); + err = -ENOSPC; + goto out; + } else if (cmt_retries > 32) + ubifs_warn("too many space allocation re-tries (%d)", + cmt_retries); + + dbg_jnl("-EAGAIN, commit and retry (retried %d times)", + cmt_retries); + cmt_retries += 1; + + err = ubifs_run_commit(c); + if (err) + return err; + goto again; + +out: + ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + len, jhead, err); + if (err == -ENOSPC) { + /* These are some budgeting problems, print useful information */ + down_write(&c->commit_sem); + spin_lock(&c->space_lock); + dbg_dump_stack(); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + cmt_retries = dbg_check_lprops(c); + up_write(&c->commit_sem); + } + return err; +} + +/** + * release_head - release a journal head. + * @c: UBIFS file-system description object + * @jhead: journal head + * + * This function releases journal head @jhead which was locked by + * the 'make_reservation()' function. It has to be called after each successful + * 'make_reservation()' invocation. 
+ */ +static inline void release_head(struct ubifs_info *c, int jhead) +{ + mutex_unlock(&c->jheads[jhead].wbuf.io_mutex); +} + +/** + * finish_reservation - finish a reservation. + * @c: UBIFS file-system description object + * + * This function finishes journal space reservation. It must be called after + * 'make_reservation()'. + */ +static void finish_reservation(struct ubifs_info *c) +{ + up_read(&c->commit_sem); +} + +/** + * get_dent_type - translate VFS inode mode to UBIFS directory entry type. + * @mode: inode mode + */ +static int get_dent_type(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return UBIFS_ITYPE_REG; + case S_IFDIR: + return UBIFS_ITYPE_DIR; + case S_IFLNK: + return UBIFS_ITYPE_LNK; + case S_IFBLK: + return UBIFS_ITYPE_BLK; + case S_IFCHR: + return UBIFS_ITYPE_CHR; + case S_IFIFO: + return UBIFS_ITYPE_FIFO; + case S_IFSOCK: + return UBIFS_ITYPE_SOCK; + default: + BUG(); + } + return 0; +} + +/** + * pack_inode - pack an inode node. + * @c: UBIFS file-system description object + * @ino: buffer in which to pack inode node + * @inode: inode to pack + * @last: indicates the last node of the group + */ +static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, + const struct inode *inode, int last) +{ + int data_len = 0, last_reference = !inode->i_nlink; + struct ubifs_inode *ui = ubifs_inode(inode); + + ino->ch.node_type = UBIFS_INO_NODE; + ino_key_init_flash(c, &ino->key, inode->i_ino); + ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); + ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); + ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); + ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); + ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->uid = cpu_to_le32(inode->i_uid); + ino->gid = cpu_to_le32(inode->i_gid); + ino->mode = cpu_to_le32(inode->i_mode); + ino->flags = 
cpu_to_le32(ui->flags); + ino->size = cpu_to_le64(ui->ui_size); + ino->nlink = cpu_to_le32(inode->i_nlink); + ino->compr_type = cpu_to_le16(ui->compr_type); + ino->data_len = cpu_to_le32(ui->data_len); + ino->xattr_cnt = cpu_to_le32(ui->xattr_cnt); + ino->xattr_size = cpu_to_le32(ui->xattr_size); + ino->xattr_names = cpu_to_le32(ui->xattr_names); + zero_ino_node_unused(ino); + + /* + * Drop the attached data if this is a deletion inode, the data is not + * needed anymore. + */ + if (!last_reference) { + memcpy(ino->data, ui->data, ui->data_len); + data_len = ui->data_len; + } + + ubifs_prep_grp_node(c, ino, UBIFS_INO_NODE_SZ + data_len, last); +} + +/** + * mark_inode_clean - mark UBIFS inode as clean. + * @c: UBIFS file-system description object + * @ui: UBIFS inode to mark as clean + * + * This helper function marks UBIFS inode @ui as clean by cleaning the + * @ui->dirty flag and releasing its budget. Note, VFS may still treat the + * inode as dirty and try to write it back, but 'ubifs_write_inode()' would + * just do nothing. + */ +static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) +{ + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + ui->dirty = 0; +} + +/** + * ubifs_jnl_update - update inode. + * @c: UBIFS file-system description object + * @dir: parent inode or host inode in case of extended attributes + * @nm: directory entry name + * @inode: inode to update + * @deletion: indicates a directory entry deletion i.e unlink or rmdir + * @xent: non-zero if the directory entry is an extended attribute entry + * + * This function updates an inode by writing a directory entry (or extended + * attribute entry), the inode itself, and the parent directory inode (or the + * host inode) to the journal. + * + * The function writes the host inode @dir last, which is important in case of + * extended attributes. 
Indeed, then we guarantee that if the host inode gets + * synchronized (with 'fsync()'), and the write-buffer it sits in gets flushed, + * the extended attribute inode gets flushed too. And this is exactly what the + * user expects - synchronizing the host inode synchronizes its extended + * attributes. Similarly, this guarantees that if @dir is synchronized, its + * directory entry corresponding to @nm gets synchronized too. + * + * If the inode (@inode) or the parent directory (@dir) are synchronous, this + * function synchronizes the write-buffer. + * + * This function marks the @dir and @inode inodes as clean and returns zero on + * success. In case of failure, a negative error code is returned. + */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent) +{ + int err, dlen, ilen, len, lnum, ino_offs, dent_offs; + int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); + int last_reference = !!(deletion && inode->i_nlink == 0); + struct ubifs_inode *ui = ubifs_inode(inode); + struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_dent_node *dent; + struct ubifs_ino_node *ino; + union ubifs_key dent_key, ino_key; + + dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + ubifs_assert(dir_ui->data_len == 0); + ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + + dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + ilen = UBIFS_INO_NODE_SZ; + + /* + * If the last reference to the inode is being deleted, then there is + * no need to attach and write inode data, it is being deleted anyway. + * And if the inode is being deleted, no need to synchronize + * write-buffer even if the inode is synchronous. 
+ */ + if (!last_reference) { + ilen += ui->data_len; + sync |= IS_SYNC(inode); + } + + aligned_dlen = ALIGN(dlen, 8); + aligned_ilen = ALIGN(ilen, 8); + len = aligned_dlen + aligned_ilen + UBIFS_INO_NODE_SZ; + dent = kmalloc(len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + if (!xent) { + dent->ch.node_type = UBIFS_DENT_NODE; + dent_key_init(c, &dent_key, dir->i_ino, nm); + } else { + dent->ch.node_type = UBIFS_XENT_NODE; + xent_key_init(c, &dent_key, dir->i_ino, nm); + } + + key_write(c, &dent_key, dent->key); + dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); + dent->type = get_dent_type(inode->i_mode); + dent->nlen = cpu_to_le16(nm->len); + memcpy(dent->name, nm->name, nm->len); + dent->name[nm->len] = '\0'; + zero_dent_node_unused(dent); + ubifs_prep_grp_node(c, dent, dlen, 0); + + ino = (void *)dent + aligned_dlen; + pack_inode(c, ino, inode, 0); + ino = (void *)ino + aligned_ilen; + pack_inode(c, ino, dir, 1); + + if (last_reference) { + err = ubifs_add_orphan(c, inode->i_ino); + if (err) { + release_head(c, BASEHD); + goto out_finish; + } + ui->del_cmtno = c->cmt_no; + } + + err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync); + if (err) + goto out_release; + if (!sync) { + struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf; + + ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino); + ubifs_wbuf_add_ino_nolock(wbuf, dir->i_ino); + } + release_head(c, BASEHD); + kfree(dent); + + if (deletion) { + err = ubifs_tnc_remove_nm(c, &dent_key, nm); + if (err) + goto out_ro; + err = ubifs_add_dirt(c, lnum, dlen); + } else + err = ubifs_tnc_add_nm(c, &dent_key, lnum, dent_offs, dlen, nm); + if (err) + goto out_ro; + + /* + * Note, we do not remove the inode from TNC even if the last reference + * to it has just been deleted, because the inode may still be opened. 
+ * Instead, the inode has been added to orphan lists and the orphan + * subsystem will take further care about it. + */ + ino_key_init(c, &ino_key, inode->i_ino); + ino_offs = dent_offs + aligned_dlen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, ilen); + if (err) + goto out_ro; + + ino_key_init(c, &ino_key, dir->i_ino); + ino_offs += aligned_ilen; + err = ubifs_tnc_add(c, &ino_key, lnum, ino_offs, UBIFS_INO_NODE_SZ); + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + mark_inode_clean(c, ui); + mark_inode_clean(c, dir_ui); + return 0; + +out_finish: + finish_reservation(c); +out_free: + kfree(dent); + return err; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + if (last_reference) + ubifs_delete_orphan(c, inode->i_ino); + finish_reservation(c); + return err; +} + +/** + * ubifs_jnl_write_data - write a data node to the journal. + * @c: UBIFS file-system description object + * @inode: inode the data node belongs to + * @key: node key + * @buf: buffer to write + * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) + * + * This function writes a data node to the journal. Returns %0 if the data node + * was successfully written, and a negative error code in case of failure. 
+ */ +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len) +{ + struct ubifs_data_node *data; + int err, lnum, offs, compr_type, out_len; + int dlen = UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_jnl("ino %lu, blk %u, len %d, key %s", + (unsigned long)key_inum(c, key), key_block(c, key), len, + DBGKEY(key)); + ubifs_assert(len <= UBIFS_BLOCK_SIZE); + + data = kmalloc(dlen, GFP_NOFS); + if (!data) + return -ENOMEM; + + data->ch.node_type = UBIFS_DATA_NODE; + key_write(c, key, &data->key); + data->size = cpu_to_le32(len); + zero_data_node_unused(data); + + if (!(ui->flags & UBIFS_COMPR_FL)) + /* Compression is disabled for this inode */ + compr_type = UBIFS_COMPR_NONE; + else + compr_type = ui->compr_type; + + out_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + + dlen = UBIFS_DATA_NODE_SZ + out_len; + data->compr_type = cpu_to_le16(compr_type); + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, DATAHD, dlen); + if (err) + goto out_free; + + err = write_node(c, DATAHD, data, dlen, &lnum, &offs); + if (err) + goto out_release; + ubifs_wbuf_add_ino_nolock(&c->jheads[DATAHD].wbuf, key_inum(c, key)); + release_head(c, DATAHD); + + err = ubifs_tnc_add(c, key, lnum, offs, dlen); + if (err) + goto out_ro; + + finish_reservation(c); + kfree(data); + return 0; + +out_release: + release_head(c, DATAHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(data); + return err; +} + +/** + * ubifs_jnl_write_inode - flush inode to the journal. + * @c: UBIFS file-system description object + * @inode: inode to flush + * + * This function writes inode @inode to the journal. If the inode is + * synchronous, it also synchronizes the write-buffer. 
Returns zero in case of + * success and a negative error code in case of failure. + */ +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err, lnum, offs; + struct ubifs_ino_node *ino; + struct ubifs_inode *ui = ubifs_inode(inode); + int sync = 0, len = UBIFS_INO_NODE_SZ, last_reference = !inode->i_nlink; + + dbg_jnl("ino %lu, nlink %u", inode->i_ino, inode->i_nlink); + + /* + * If the inode is being deleted, do not write the attached data. No + * need to synchronize the write-buffer either. + */ + if (!last_reference) { + len += ui->data_len; + sync = IS_SYNC(inode); + } + ino = kmalloc(len, GFP_NOFS); + if (!ino) + return -ENOMEM; + + /* Make reservation before allocating sequence numbers */ + err = make_reservation(c, BASEHD, len); + if (err) + goto out_free; + + pack_inode(c, ino, inode, 1); + err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync); + if (err) + goto out_release; + if (!sync) + ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, + inode->i_ino); + release_head(c, BASEHD); + + if (last_reference) { + err = ubifs_tnc_remove_ino(c, inode->i_ino); + if (err) + goto out_ro; + ubifs_delete_orphan(c, inode->i_ino); + err = ubifs_add_dirt(c, lnum, len); + } else { + union ubifs_key key; + + ino_key_init(c, &key, inode->i_ino); + err = ubifs_tnc_add(c, &key, lnum, offs, len); + } + if (err) + goto out_ro; + + finish_reservation(c); + spin_lock(&ui->ui_lock); + ui->synced_i_size = ui->ui_size; + spin_unlock(&ui->ui_lock); + kfree(ino); + return 0; + +out_release: + release_head(c, BASEHD); +out_ro: + ubifs_ro_mode(c, err); + finish_reservation(c); +out_free: + kfree(ino); + return err; +} + +/** + * ubifs_jnl_delete_inode - delete an inode. + * @c: UBIFS file-system description object + * @inode: inode to delete + * + * This function deletes inode @inode which includes removing it from orphans, + * deleting it from TNC and, in some cases, writing a deletion inode to the + * journal. 
/**
 * ubifs_jnl_delete_inode - delete an inode.
 * @c: UBIFS file-system description object
 * @inode: inode to delete
 *
 * This function deletes inode @inode which includes removing it from orphans,
 * deleting it from TNC and, in some cases, writing a deletion inode to the
 * journal.
 *
 * When regular file inodes are unlinked or a directory inode is removed, the
 * 'ubifs_jnl_update()' function writes a corresponding deletion inode and
 * direntry to the media, and adds the inode to orphans. After this, when the
 * last reference to this inode has been dropped, this function is called. In
 * general, it has to write one more deletion inode to the media, because if
 * a commit happened between 'ubifs_jnl_update()' and
 * 'ubifs_jnl_delete_inode()', the deletion inode is not in the journal
 * anymore, and in fact it might not be on the flash anymore, because it might
 * have been garbage-collected already. And for optimization reasons UBIFS does
 * not read the orphan area if it has been unmounted cleanly, so it would have
 * no indication in the journal that there is a deleted inode which has to be
 * removed from TNC.
 *
 * However, if there was no commit between 'ubifs_jnl_update()' and
 * 'ubifs_jnl_delete_inode()', then there is no need to write the deletion
 * inode to the media for the second time. And this is quite a typical case.
 * Whether a commit happened is detected by comparing the commit number saved
 * in the inode (@del_cmtno) with the current commit number (@c->cmt_no).
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode)
{
	int err;
	struct ubifs_inode *ui = ubifs_inode(inode);

	ubifs_assert(inode->i_nlink == 0);

	if (ui->del_cmtno != c->cmt_no)
		/* A commit happened for sure */
		return ubifs_jnl_write_inode(c, inode);

	down_read(&c->commit_sem);
	/*
	 * Check commit number again, because the first test has been done
	 * without @c->commit_sem, so a commit might have happened.
	 */
	if (ui->del_cmtno != c->cmt_no) {
		up_read(&c->commit_sem);
		return ubifs_jnl_write_inode(c, inode);
	}

	/* No commit in between - it is enough to update TNC and orphans */
	err = ubifs_tnc_remove_ino(c, inode->i_ino);
	if (err)
		ubifs_ro_mode(c, err);
	else
		ubifs_delete_orphan(c, inode->i_ino);
	up_read(&c->commit_sem);
	return err;
}

/**
 * ubifs_jnl_rename - rename a directory entry.
 * @c: UBIFS file-system description object
 * @old_dir: parent inode of directory entry to rename
 * @old_dentry: directory entry to rename
 * @new_dir: parent inode of the new directory entry
 * @new_dentry: new directory entry (or directory entry to replace)
 * @sync: non-zero if the write-buffer has to be synchronized
 *
 * This function implements the re-name operation which may involve writing up
 * to 3 inodes and 2 directory entries. All the nodes are packed into one
 * buffer in the following order: the new directory entry, the deletion entry
 * for the old name, the inode being replaced (only if @new_dentry already has
 * an inode), the old parent inode and, if the entry moves to a different
 * directory, the new parent inode.
 *
 * It marks the written inodes as clean and returns zero on success. In case of
 * failure, a negative error code is returned.
 */
int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
		     const struct dentry *old_dentry,
		     const struct inode *new_dir,
		     const struct dentry *new_dentry, int sync)
{
	void *p;
	union ubifs_key key;
	struct ubifs_dent_node *dent, *dent2;
	int err, dlen1, dlen2, ilen, lnum, offs, len;
	const struct inode *old_inode = old_dentry->d_inode;
	const struct inode *new_inode = new_dentry->d_inode;
	int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
	int last_reference = !!(new_inode && new_inode->i_nlink == 0);
	int move = (old_dir != new_dir);
	struct ubifs_inode *uninitialized_var(new_ui);

	dbg_jnl("dent '%.*s' in dir ino %lu to dent '%.*s' in dir ino %lu",
		old_dentry->d_name.len, old_dentry->d_name.name,
		old_dir->i_ino, new_dentry->d_name.len,
		new_dentry->d_name.name, new_dir->i_ino);
	ubifs_assert(ubifs_inode(old_dir)->data_len == 0);
	ubifs_assert(ubifs_inode(new_dir)->data_len == 0);
	ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex));
	ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex));

	/* dlen1 is the new direntry length, dlen2 - the deletion direntry */
	dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1;
	dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1;
	if (new_inode) {
		new_ui = ubifs_inode(new_inode);
		ubifs_assert(mutex_is_locked(&new_ui->ui_mutex));
		ilen = UBIFS_INO_NODE_SZ;
		/* The data of an inode which is going away is not written */
		if (!last_reference)
			ilen += new_ui->data_len;
	} else
		ilen = 0;

	aligned_dlen1 = ALIGN(dlen1, 8);
	aligned_dlen2 = ALIGN(dlen2, 8);
	len = aligned_dlen1 + aligned_dlen2 + ALIGN(ilen, 8) + ALIGN(plen, 8);
	/* One more parent inode if the entry moves to another directory */
	if (old_dir != new_dir)
		len += plen;
	dent = kmalloc(len, GFP_NOFS);
	if (!dent)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	/* Make new dent */
	dent->ch.node_type = UBIFS_DENT_NODE;
	dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name);
	dent->inum = cpu_to_le64(old_inode->i_ino);
	dent->type = get_dent_type(old_inode->i_mode);
	dent->nlen = cpu_to_le16(new_dentry->d_name.len);
	memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len);
	dent->name[new_dentry->d_name.len] = '\0';
	zero_dent_node_unused(dent);
	ubifs_prep_grp_node(c, dent, dlen1, 0);

	/* Make deletion dent */
	dent2 = (void *)dent + aligned_dlen1;
	dent2->ch.node_type = UBIFS_DENT_NODE;
	dent_key_init_flash(c, &dent2->key, old_dir->i_ino,
			    &old_dentry->d_name);
	dent2->inum = 0;
	dent2->type = DT_UNKNOWN;
	dent2->nlen = cpu_to_le16(old_dentry->d_name.len);
	memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len);
	dent2->name[old_dentry->d_name.len] = '\0';
	zero_dent_node_unused(dent2);
	ubifs_prep_grp_node(c, dent2, dlen2, 0);

	/* The inodes follow the two directory entries */
	p = (void *)dent2 + aligned_dlen2;
	if (new_inode) {
		pack_inode(c, p, new_inode, 0);
		p += ALIGN(ilen, 8);
	}

	/* The last packed inode is the one which ends the node group */
	if (!move)
		pack_inode(c, p, old_dir, 1);
	else {
		pack_inode(c, p, old_dir, 0);
		p += ALIGN(plen, 8);
		pack_inode(c, p, new_dir, 1);
	}

	if (last_reference) {
		err = ubifs_add_orphan(c, new_inode->i_ino);
		if (err) {
			release_head(c, BASEHD);
			goto out_finish;
		}
		new_ui->del_cmtno = c->cmt_no;
	}

	err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
	if (err)
		goto out_release;
	if (!sync) {
		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;

		ubifs_wbuf_add_ino_nolock(wbuf, new_dir->i_ino);
		ubifs_wbuf_add_ino_nolock(wbuf, old_dir->i_ino);
		if (new_inode)
			ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf,
						  new_inode->i_ino);
	}
	release_head(c, BASEHD);

	/* Add the new directory entry to TNC */
	dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name);
	err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name);
	if (err)
		goto out_ro;

	/* The deletion entry is accounted as dirty space */
	err = ubifs_add_dirt(c, lnum, dlen2);
	if (err)
		goto out_ro;

	/* Remove the old directory entry from TNC */
	dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name);
	err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name);
	if (err)
		goto out_ro;

	/* Skip both directory entries - the inodes come next */
	offs += aligned_dlen1 + aligned_dlen2;
	if (new_inode) {
		ino_key_init(c, &key, new_inode->i_ino);
		err = ubifs_tnc_add(c, &key, lnum, offs, ilen);
		if (err)
			goto out_ro;
		offs += ALIGN(ilen, 8);
	}

	ino_key_init(c, &key, old_dir->i_ino);
	err = ubifs_tnc_add(c, &key, lnum, offs, plen);
	if (err)
		goto out_ro;

	if (old_dir != new_dir) {
		offs += ALIGN(plen, 8);
		ino_key_init(c, &key, new_dir->i_ino);
		err = ubifs_tnc_add(c, &key, lnum, offs, plen);
		if (err)
			goto out_ro;
	}

	finish_reservation(c);
	if (new_inode) {
		mark_inode_clean(c, new_ui);
		spin_lock(&new_ui->ui_lock);
		new_ui->synced_i_size = new_ui->ui_size;
		spin_unlock(&new_ui->ui_lock);
	}
	mark_inode_clean(c, ubifs_inode(old_dir));
	if (move)
		mark_inode_clean(c, ubifs_inode(new_dir));
	kfree(dent);
	return 0;

out_release:
	release_head(c, BASEHD);
out_ro:
	ubifs_ro_mode(c, err);
	/* Undo the orphan registration which was done before the write */
	if (last_reference)
		ubifs_delete_orphan(c, new_inode->i_ino);
out_finish:
	finish_reservation(c);
out_free:
	kfree(dent);
	return err;
}
* recomp_data_node - re-compress a truncated data node. + * @dn: data node to re-compress + * @new_len: new length + * + * This function is used when an inode is truncated and the last data node of + * the inode has to be re-compressed and re-written. + */ +static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +{ + void *buf; + int err, len, compr_type, out_len; + + out_len = le32_to_cpu(dn->size); + buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); + if (!buf) + return -ENOMEM; + + len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + compr_type = le16_to_cpu(dn->compr_type); + err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + dn->compr_type = cpu_to_le16(compr_type); + dn->size = cpu_to_le32(*new_len); + *new_len = UBIFS_DATA_NODE_SZ + out_len; +out: + kfree(buf); + return err; +} + +/** + * ubifs_jnl_truncate - update the journal for a truncation. + * @c: UBIFS file-system description object + * @inode: inode to truncate + * @old_size: old size + * @new_size: new size + * + * When the size of a file decreases due to truncation, a truncation node is + * written, the journal tree is updated, and the last data block is re-written + * if it has been affected. The inode is also updated in order to synchronize + * the new inode size. + * + * This function marks the inode as clean and returns zero on success. In case + * of failure, a negative error code is returned. 
/**
 * ubifs_jnl_truncate - update the journal for a truncation.
 * @c: UBIFS file-system description object
 * @inode: inode to truncate
 * @old_size: old size
 * @new_size: new size
 *
 * When the size of a file decreases due to truncation, a truncation node is
 * written, the journal tree is updated, and the last data block is re-written
 * if it has been affected. The inode is also updated in order to synchronize
 * the new inode size.
 *
 * All the nodes live in one buffer in the following order: the inode node,
 * the truncation node and, if the new size is not block-aligned and the last
 * block has to be shortened, the re-written last data node.
 *
 * This function marks the inode as clean and returns zero on success. In case
 * of failure, a negative error code is returned.
 */
int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
		       loff_t old_size, loff_t new_size)
{
	union ubifs_key key, to_key;
	struct ubifs_ino_node *ino;
	struct ubifs_trun_node *trun;
	struct ubifs_data_node *uninitialized_var(dn);
	int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
	struct ubifs_inode *ui = ubifs_inode(inode);
	ino_t inum = inode->i_ino;
	unsigned int blk;

	dbg_jnl("ino %lu, size %lld -> %lld",
		(unsigned long)inum, old_size, new_size);
	ubifs_assert(!ui->data_len);
	ubifs_assert(S_ISREG(inode->i_mode));
	ubifs_assert(mutex_is_locked(&ui->ui_mutex));

	/* Room for all three nodes, the data node in its worst case */
	sz = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ +
	     UBIFS_MAX_DATA_NODE_SZ * WORST_COMPR_FACTOR;
	ino = kmalloc(sz, GFP_NOFS);
	if (!ino)
		return -ENOMEM;

	/* The truncation node goes right after the inode node */
	trun = (void *)ino + UBIFS_INO_NODE_SZ;
	trun->ch.node_type = UBIFS_TRUN_NODE;
	trun->inum = cpu_to_le32(inum);
	trun->old_size = cpu_to_le64(old_size);
	trun->new_size = cpu_to_le64(new_size);
	zero_trun_node_unused(trun);

	/* Number of bytes which stay in the last block after truncation */
	dlen = new_size & (UBIFS_BLOCK_SIZE - 1);
	if (dlen) {
		/* Get last data block so it can be truncated */
		dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
		blk = new_size >> UBIFS_BLOCK_SHIFT;
		data_key_init(c, &key, inum, blk);
		dbg_jnl("last block key %s", DBGKEY(&key));
		err = ubifs_tnc_lookup(c, &key, dn);
		if (err == -ENOENT)
			dlen = 0; /* Not found (so it is a hole) */
		else if (err)
			goto out_free;
		else {
			if (le32_to_cpu(dn->size) <= dlen)
				dlen = 0; /* Nothing to do */
			else {
				int compr_type = le16_to_cpu(dn->compr_type);

				/*
				 * In both branches @dlen becomes the length of
				 * the whole data node to write.
				 */
				if (compr_type != UBIFS_COMPR_NONE) {
					err = recomp_data_node(dn, &dlen);
					if (err)
						goto out_free;
				} else {
					dn->size = cpu_to_le32(dlen);
					dlen += UBIFS_DATA_NODE_SZ;
				}
				zero_data_node_unused(dn);
			}
		}
	}

	/* Must make reservation before allocating sequence numbers */
	len = UBIFS_TRUN_NODE_SZ + UBIFS_INO_NODE_SZ;
	if (dlen)
		len += dlen;
	err = make_reservation(c, BASEHD, len);
	if (err)
		goto out_free;

	/* The truncation node ends the group unless a data node follows */
	pack_inode(c, ino, inode, 0);
	ubifs_prep_grp_node(c, trun, UBIFS_TRUN_NODE_SZ, dlen ? 0 : 1);
	if (dlen)
		ubifs_prep_grp_node(c, dn, dlen, 1);

	err = write_head(c, BASEHD, ino, len, &lnum, &offs, sync);
	if (err)
		goto out_release;
	if (!sync)
		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, inum);
	release_head(c, BASEHD);

	if (dlen) {
		/* @key is still the key of the last data block here */
		sz = offs + UBIFS_INO_NODE_SZ + UBIFS_TRUN_NODE_SZ;
		err = ubifs_tnc_add(c, &key, lnum, sz, dlen);
		if (err)
			goto out_ro;
	}

	ino_key_init(c, &key, inum);
	err = ubifs_tnc_add(c, &key, lnum, offs, UBIFS_INO_NODE_SZ);
	if (err)
		goto out_ro;

	/* The truncation node is not indexed, account it as dirty space */
	err = ubifs_add_dirt(c, lnum, UBIFS_TRUN_NODE_SZ);
	if (err)
		goto out_ro;

	/* First block which lies entirely beyond the new size */
	bit = new_size & (UBIFS_BLOCK_SIZE - 1);
	blk = (new_size >> UBIFS_BLOCK_SHIFT) + (bit ? 1 : 0);
	data_key_init(c, &key, inum, blk);

	/* Last block which contained data before the truncation */
	bit = old_size & (UBIFS_BLOCK_SIZE - 1);
	blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
	data_key_init(c, &to_key, inum, blk);

	err = ubifs_tnc_remove_range(c, &key, &to_key);
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&ui->ui_lock);
	ui->synced_i_size = ui->ui_size;
	spin_unlock(&ui->ui_lock);
	mark_inode_clean(c, ui);
	kfree(ino);
	return 0;

out_release:
	release_head(c, BASEHD);
out_ro:
	ubifs_ro_mode(c, err);
	finish_reservation(c);
out_free:
	kfree(ino);
	return err;
}
/**
 * ubifs_jnl_delete_xattr - delete an extended attribute.
 * @c: UBIFS file-system description object
 * @host: host inode
 * @inode: extended attribute inode
 * @nm: extended attribute entry name
 *
 * This function deletes an extended attribute, which is very similar to
 * un-linking regular files - it writes a deletion xentry, a deletion inode and
 * updates the target inode. The nodes are written in this order, so the host
 * inode is the last node of the group. On success the host inode is marked as
 * clean. Returns zero in case of success and a negative error code in case of
 * failure.
 */
int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host,
			   const struct inode *inode, const struct qstr *nm)
{
	int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen;
	struct ubifs_dent_node *xent;
	struct ubifs_ino_node *ino;
	union ubifs_key xent_key, key1, key2;
	int sync = IS_DIRSYNC(host);
	struct ubifs_inode *host_ui = ubifs_inode(host);

	dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d",
		host->i_ino, inode->i_ino, nm->name,
		ubifs_inode(inode)->data_len);
	ubifs_assert(inode->i_nlink == 0);
	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));

	/*
	 * Since we are deleting the inode, we do not bother to attach any data
	 * to it and assume its length is %UBIFS_INO_NODE_SZ.
	 */
	xlen = UBIFS_DENT_NODE_SZ + nm->len + 1;
	aligned_xlen = ALIGN(xlen, 8);
	hlen = host_ui->data_len + UBIFS_INO_NODE_SZ;
	len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8);

	xent = kmalloc(len, GFP_NOFS);
	if (!xent)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, len);
	if (err) {
		kfree(xent);
		return err;
	}

	/* Deletion xentry: inode number zero */
	xent->ch.node_type = UBIFS_XENT_NODE;
	xent_key_init(c, &xent_key, host->i_ino, nm);
	key_write(c, &xent_key, xent->key);
	xent->inum = 0;
	xent->type = get_dent_type(inode->i_mode);
	xent->nlen = cpu_to_le16(nm->len);
	memcpy(xent->name, nm->name, nm->len);
	xent->name[nm->len] = '\0';
	zero_dent_node_unused(xent);
	ubifs_prep_grp_node(c, xent, xlen, 0);

	/* Deletion inode of the extended attribute, then the host inode */
	ino = (void *)xent + aligned_xlen;
	pack_inode(c, ino, inode, 0);
	ino = (void *)ino + UBIFS_INO_NODE_SZ;
	pack_inode(c, ino, host, 1);

	err = write_head(c, BASEHD, xent, len, &lnum, &xent_offs, sync);
	if (!sync && !err)
		ubifs_wbuf_add_ino_nolock(&c->jheads[BASEHD].wbuf, host->i_ino);
	/* The head is released and the buffer freed on success and failure */
	release_head(c, BASEHD);
	kfree(xent);
	if (err)
		goto out_ro;

	/* Remove the extended attribute entry from TNC */
	err = ubifs_tnc_remove_nm(c, &xent_key, nm);
	if (err)
		goto out_ro;
	err = ubifs_add_dirt(c, lnum, xlen);
	if (err)
		goto out_ro;

	/*
	 * Remove all nodes belonging to the extended attribute inode from TNC.
	 * Well, there actually must be only one node - the inode itself.
	 */
	lowest_ino_key(c, &key1, inode->i_ino);
	highest_ino_key(c, &key2, inode->i_ino);
	err = ubifs_tnc_remove_range(c, &key1, &key2);
	if (err)
		goto out_ro;
	err = ubifs_add_dirt(c, lnum, UBIFS_INO_NODE_SZ);
	if (err)
		goto out_ro;

	/* And update TNC with the new host inode position */
	ino_key_init(c, &key1, host->i_ino);
	err = ubifs_tnc_add(c, &key1, lnum, xent_offs + len - hlen, hlen);
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&host_ui->ui_lock);
	host_ui->synced_i_size = host_ui->ui_size;
	spin_unlock(&host_ui->ui_lock);
	mark_inode_clean(c, host_ui);
	return 0;

out_ro:
	ubifs_ro_mode(c, err);
	finish_reservation(c);
	return err;
}
/**
 * ubifs_jnl_change_xattr - change an extended attribute.
 * @c: UBIFS file-system description object
 * @inode: extended attribute inode
 * @host: host inode
 *
 * This function writes the updated version of an extended attribute inode and
 * the host inode to the journal (to the base head). Both inodes are packed
 * into one buffer, the host inode first and the extended attribute inode as
 * the last node of the group, and both are then added to TNC. On success the
 * host inode is marked as clean. This function returns zero in case of success
 * and a negative error code in case of failure.
 */
int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode,
			   const struct inode *host)
{
	int err, len1, len2, aligned_len, aligned_len1, lnum, offs;
	struct ubifs_inode *host_ui = ubifs_inode(host);
	struct ubifs_ino_node *ino;
	union ubifs_key key;
	int sync = IS_DIRSYNC(host);

	dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino);
	ubifs_assert(host->i_nlink > 0);
	ubifs_assert(inode->i_nlink > 0);
	ubifs_assert(mutex_is_locked(&host_ui->ui_mutex));

	/* len1 is the host inode node length, len2 - the xattr inode node */
	len1 = UBIFS_INO_NODE_SZ + host_ui->data_len;
	len2 = UBIFS_INO_NODE_SZ + ubifs_inode(inode)->data_len;
	aligned_len1 = ALIGN(len1, 8);
	aligned_len = aligned_len1 + ALIGN(len2, 8);

	ino = kmalloc(aligned_len, GFP_NOFS);
	if (!ino)
		return -ENOMEM;

	/* Make reservation before allocating sequence numbers */
	err = make_reservation(c, BASEHD, aligned_len);
	if (err)
		goto out_free;

	pack_inode(c, ino, host, 0);
	pack_inode(c, (void *)ino + aligned_len1, inode, 1);

	/*
	 * NOTE(review): the last 'write_head()' argument is a literal 0 here,
	 * while the other journal functions pass their 'sync' variable. With
	 * a synchronous @host this looks like the nodes are neither written
	 * synchronously nor registered in the write-buffer below - TODO
	 * confirm whether this is intended.
	 */
	err = write_head(c, BASEHD, ino, aligned_len, &lnum, &offs, 0);
	if (!sync && !err) {
		struct ubifs_wbuf *wbuf = &c->jheads[BASEHD].wbuf;

		ubifs_wbuf_add_ino_nolock(wbuf, host->i_ino);
		ubifs_wbuf_add_ino_nolock(wbuf, inode->i_ino);
	}
	/* The head is released regardless of the write result */
	release_head(c, BASEHD);
	if (err)
		goto out_ro;

	ino_key_init(c, &key, host->i_ino);
	err = ubifs_tnc_add(c, &key, lnum, offs, len1);
	if (err)
		goto out_ro;

	ino_key_init(c, &key, inode->i_ino);
	err = ubifs_tnc_add(c, &key, lnum, offs + aligned_len1, len2);
	if (err)
		goto out_ro;

	finish_reservation(c);
	spin_lock(&host_ui->ui_lock);
	host_ui->synced_i_size = host_ui->ui_size;
	spin_unlock(&host_ui->ui_lock);
	mark_inode_clean(c, host_ui);
	kfree(ino);
	return 0;

out_ro:
	ubifs_ro_mode(c, err);
	finish_reservation(c);
out_free:
	kfree(ino);
	return err;
}
+0200 +++ ubifs-v2.6.24/fs/ubifs/key.h 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,557 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This header contains various key-related definitions and helper functions. + * UBIFS allows several key schemes, so we access key fields only via these + * helpers. At the moment only one key scheme is supported. + * + * Simple key scheme + * ~~~~~~~~~~~~~~~~~ + * + * Keys are 64-bits long. First 32-bits are inode number (parent inode number + * in case of direntry key). Next 3 bits are node type. The last 29 bits are + * the 4KiB block number in case of a data node, and the direntry hash in case + * of a direntry node. We use "r5" hash borrowed from reiserfs. + */ + +#ifndef __UBIFS_KEY_H__ +#define __UBIFS_KEY_H__ + +/** + * key_mask_hash - mask a valid hash value. + * @hash: hash value to be masked + * + * We use hash values as offset in directories, so values %0 and %1 are + * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This + * function makes sure the reserved values are not used. 
+ */ +static inline uint32_t key_mask_hash(uint32_t hash) +{ + hash &= UBIFS_S_KEY_HASH_MASK; + if (unlikely(hash <= 2)) + hash += 3; + return hash; +} + +/** + * key_r5_hash - R5 hash function (borrowed from reiserfs). + * @s: direntry name + * @len: name length + */ +static inline uint32_t key_r5_hash(const char *s, int len) +{ + uint32_t a = 0; + const signed char *str = (const signed char *)s; + + while (*str) { + a += *str << 4; + a += *str >> 4; + a *= 11; + str++; + } + + return key_mask_hash(a); +} + +/** + * key_test_hash - testing hash function. + * @str: direntry name + * @len: name length + */ +static inline uint32_t key_test_hash(const char *str, int len) +{ + uint32_t a = 0; + + len = min_t(uint32_t, len, 4); + memcpy(&a, str, len); + return key_mask_hash(a); +} + +/** + * ino_key_init - initialize inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void ino_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS; +} + +/** + * ino_key_init_flash - initialize on-flash inode key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: inode number + */ +static inline void ino_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum) +{ + union ubifs_key *key = k; + + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(UBIFS_INO_KEY << UBIFS_S_KEY_BLOCK_BITS); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_ino_key - get the lowest possible inode key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void lowest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0; +} + +/** + * highest_ino_key - get the highest possible inode key. 
+ * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: inode number + */ +static inline void highest_ino_key(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum) +{ + key->u32[0] = inum; + key->u32[1] = 0xffffffff; +} + +/** + * dent_key_init - initialize directory entry key. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + const struct qstr *nm) +{ + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_hash - initialize directory entry key without re-calculating + * hash function. + * @c: UBIFS file-system description object + * @key: key to initialize + * @inum: parent inode number + * @hash: direntry name hash + */ +static inline void dent_key_init_hash(const struct ubifs_info *c, + union ubifs_key *key, ino_t inum, + uint32_t hash) +{ + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->u32[0] = inum; + key->u32[1] = hash | (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS); +} + +/** + * dent_key_init_flash - initialize on-flash directory entry key. + * @c: UBIFS file-system description object + * @k: key to initialize + * @inum: parent inode number + * @nm: direntry name and length + */ +static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, + ino_t inum, const struct qstr *nm) +{ + union ubifs_key *key = k; + uint32_t hash = c->key_hash(nm->name, nm->len); + + ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); + key->j32[0] = cpu_to_le32(inum); + key->j32[1] = cpu_to_le32(hash | + (UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS)); + memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8); +} + +/** + * lowest_dent_key - get the lowest possible directory entry key. 
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: parent inode number
+ */
+static inline void lowest_dent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	/* Directory entry type with a zero hash */
+	key->u32[1] = UBIFS_DENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	key->u32[0] = inum;
+}
+
+/**
+ * xent_key_init - build the in-memory key of an extended attribute entry.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ *
+ * The name hash is obtained from the 'c->key_hash()' callback.
+ */
+static inline void xent_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 const struct qstr *nm)
+{
+	uint32_t hash;
+
+	hash = c->key_hash(nm->name, nm->len);
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+
+	/* Put the key type above the hash bits */
+	hash |= UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	key->u32[0] = inum;
+	key->u32[1] = hash;
+}
+
+/**
+ * xent_key_init_hash - build the in-memory key of an extended attribute entry
+ *                      from an already calculated name hash.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: host inode number
+ * @hash: extended attribute entry name hash
+ */
+static inline void xent_key_init_hash(const struct ubifs_info *c,
+				      union ubifs_key *key, ino_t inum,
+				      uint32_t hash)
+{
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+
+	/* Put the key type above the hash bits */
+	hash |= UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	key->u32[0] = inum;
+	key->u32[1] = hash;
+}
+
+/**
+ * xent_key_init_flash - initialize on-flash extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: host inode number
+ * @nm: extended attribute entry name and length
+ */
+static inline void xent_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, const struct qstr *nm)
+{
+	uint32_t hash = c->key_hash(nm->name, nm->len);
+	union ubifs_key *key = k;
+
+	ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK));
+	hash |= UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+
+	/* Both words go to the media little-endian, the key tail is zeroed */
+	key->j32[0] = cpu_to_le32(inum);
+	key->j32[1] = cpu_to_le32(hash);
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * lowest_xent_key - get the lowest possible extended attribute entry key.
+ * @c: UBIFS file-system description object
+ * @key: where to store the lowest key
+ * @inum: host inode number
+ */
+static inline void lowest_xent_key(const struct ubifs_info *c,
+				   union ubifs_key *key, ino_t inum)
+{
+	/* Extended attribute entry type with a zero hash */
+	key->u32[1] = UBIFS_XENT_KEY << UBIFS_S_KEY_HASH_BITS;
+	key->u32[0] = inum;
+}
+
+/**
+ * data_key_init - build the in-memory key of a data node.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum,
+				 unsigned int block)
+{
+	ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
+
+	/* Put the key type above the block number bits */
+	block |= UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS;
+	key->u32[0] = inum;
+	key->u32[1] = block;
+}
+
+/**
+ * data_key_init_flash - initialize on-flash data key.
+ * @c: UBIFS file-system description object
+ * @k: key to initialize
+ * @inum: inode number
+ * @block: block number
+ */
+static inline void data_key_init_flash(const struct ubifs_info *c, void *k,
+				       ino_t inum, unsigned int block)
+{
+	union ubifs_key *key = k;
+
+	ubifs_assert(!(block & ~UBIFS_S_KEY_BLOCK_MASK));
+	block |= UBIFS_DATA_KEY << UBIFS_S_KEY_BLOCK_BITS;
+
+	/* Both words go to the media little-endian, the key tail is zeroed */
+	key->j32[0] = cpu_to_le32(inum);
+	key->j32[1] = cpu_to_le32(block);
+	memset(k + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * trun_key_init - build the key of a truncation node.
+ * @c: UBIFS file-system description object
+ * @key: key to initialize
+ * @inum: inode number
+ *
+ * Note, UBIFS does not have truncation keys on the media and this function is
+ * only used for purposes of replay.
+ */
+static inline void trun_key_init(const struct ubifs_info *c,
+				 union ubifs_key *key, ino_t inum)
+{
+	key->u32[1] = UBIFS_TRUN_KEY << UBIFS_S_KEY_BLOCK_BITS;
+	key->u32[0] = inum;
+}
+
+/**
+ * key_type - get the type of an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: key to get type of
+ */
+static inline int key_type(const struct ubifs_info *c,
+			   const union ubifs_key *key)
+{
+	/* The type is what remains of the second word above the block bits */
+	const int type = key->u32[1] >> UBIFS_S_KEY_BLOCK_BITS;
+
+	return type;
+}
+
+/**
+ * key_type_flash - get the type of an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to get type of
+ */
+static inline int key_type_flash(const struct ubifs_info *c, const void *k)
+{
+	return le32_to_cpu(((const union ubifs_key *)k)->j32[1]) >>
+	       UBIFS_S_KEY_BLOCK_BITS;
+}
+
+/**
+ * key_inum - fetch the inode number from an in-memory key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum(const struct ubifs_info *c, const void *k)
+{
+	/* The inode number is the first 32-bit word of the key */
+	return ((const union ubifs_key *)k)->u32[0];
+}
+
+/**
+ * key_inum_flash - fetch inode number from an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: key to fetch inode number from
+ */
+static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
+{
+	/* The inode number is the first little-endian word of the key */
+	return le32_to_cpu(((const union ubifs_key *)k)->j32[0]);
+}
+
+/**
+ * key_hash - get the name hash stored in an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: the key to get hash from
+ */
+static inline uint32_t key_hash(const struct ubifs_info *c,
+				const union ubifs_key *key)
+{
+	return UBIFS_S_KEY_HASH_MASK & key->u32[1];
+}
+
+/**
+ * key_hash_flash - get the name hash stored in an on-flash formatted key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get hash from
+ */
+static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
+{
+	return le32_to_cpu(((const union ubifs_key *)k)->j32[1]) &
+	       UBIFS_S_KEY_HASH_MASK;
+}
+
+/**
+ * key_block - get the data block number stored in an in-memory key.
+ * @c: UBIFS file-system description object
+ * @key: the key to get the block number from
+ */
+static inline unsigned int key_block(const struct ubifs_info *c,
+				     const union ubifs_key *key)
+{
+	return UBIFS_S_KEY_BLOCK_MASK & key->u32[1];
+}
+
+/**
+ * key_block_flash - get the data block number stored in an on-flash formatted
+ *                   key.
+ * @c: UBIFS file-system description object
+ * @k: the key to get the block number from
+ */
+static inline unsigned int key_block_flash(const struct ubifs_info *c,
+					   const void *k)
+{
+	return le32_to_cpu(((const union ubifs_key *)k)->j32[1]) &
+	       UBIFS_S_KEY_BLOCK_MASK;
+}
+
+/**
+ * key_read - convert an on-flash key to the in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ *
+ * Only the first two 32-bit words of @from are read.
+ */
+static inline void key_read(const struct ubifs_info *c, const void *from,
+			    union ubifs_key *to)
+{
+	const union ubifs_key *src = from;
+
+	to->u32[1] = le32_to_cpu(src->j32[1]);
+	to->u32[0] = le32_to_cpu(src->j32[0]);
+}
+
+/**
+ * key_write - transform a key from in-memory format.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ */
+static inline void key_write(const struct ubifs_info *c,
+			     const union ubifs_key *from, void *to)
+{
+	union ubifs_key *dst = to;
+
+	dst->j32[0] = cpu_to_le32(from->u32[0]);
+	dst->j32[1] = cpu_to_le32(from->u32[1]);
+	/* Unlike 'key_write_idx()', also zero the rest of the key */
+	memset(to + 8, 0, UBIFS_MAX_KEY_LEN - 8);
+}
+
+/**
+ * key_write_idx - convert an in-memory key to the format used in the index.
+ * @c: UBIFS file-system description object
+ * @from: the key to transform
+ * @to: the key to store the result
+ *
+ * Only the first 8 bytes of @to are written.
+ */
+static inline void key_write_idx(const struct ubifs_info *c,
+				 const union ubifs_key *from, void *to)
+{
+	union ubifs_key *dst = to;
+
+	dst->j32[1] = cpu_to_le32(from->u32[1]);
+	dst->j32[0] = cpu_to_le32(from->u32[0]);
+}
+
+/**
+ * key_copy - copy an in-memory key.
+ * @c: UBIFS file-system description object
+ * @from: the key to copy from
+ * @to: the key to copy to
+ */
+static inline void key_copy(const struct ubifs_info *c,
+			    const union ubifs_key *from, union ubifs_key *to)
+{
+	/* The first 64-bit word is all that is copied */
+	to->u64[0] = from->u64[0];
+}
+
+/**
+ * keys_cmp - compare keys.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * The keys are compared word by word, first word first. Returns %-1 if @key1
+ * is less than @key2, %0 if the keys are equivalent and %1 if @key1 is greater
+ * than @key2.
+ */
+static inline int keys_cmp(const struct ubifs_info *c,
+			   const union ubifs_key *key1,
+			   const union ubifs_key *key2)
+{
+	int i;
+
+	for (i = 0; i < 2; i++) {
+		if (key1->u32[i] < key2->u32[i])
+			return -1;
+		if (key1->u32[i] > key2->u32[i])
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * keys_eq - determine if keys are equivalent.
+ * @c: UBIFS file-system description object
+ * @key1: the first key to compare
+ * @key2: the second key to compare
+ *
+ * This function compares 2 keys and returns %1 if @key1 is equal to @key2 and
+ * %0 if not.
+ */
+static inline int keys_eq(const struct ubifs_info *c,
+			  const union ubifs_key *key1,
+			  const union ubifs_key *key2)
+{
+	return key1->u32[0] == key2->u32[0] && key1->u32[1] == key2->u32[1];
+}
+
+/**
+ * is_hash_key - is a key vulnerable to hash collisions.
+ * @c: UBIFS file-system description object
+ * @key: key
+ *
+ * Directory entry and extended attribute entry keys are the hashed ones. This
+ * function returns %1 if @key is a hashed key or %0 otherwise.
+ */
+static inline int is_hash_key(const struct ubifs_info *c,
+			      const union ubifs_key *key)
+{
+	const int type = key_type(c, key);
+
+	if (type == UBIFS_DENT_KEY)
+		return 1;
+	return type == UBIFS_XENT_KEY;
+}
+
+/**
+ * key_max_inode_size - get maximum file size allowed by current key format.
+ * @c: UBIFS file-system description object
+ *
+ * Returns %0 if the key format is not the simple key format.
+ */
+static inline unsigned long long key_max_inode_size(const struct ubifs_info *c)
+{
+	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT)
+		return (1ULL << UBIFS_S_KEY_BLOCK_BITS) * UBIFS_BLOCK_SIZE;
+
+	return 0;
+}
+#endif /* !__UBIFS_KEY_H__ */
diff -Nurd linux-2.6.24/fs/ubifs/log.c ubifs-v2.6.24/fs/ubifs/log.c
--- linux-2.6.24/fs/ubifs/log.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/log.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,806 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file is a part of UBIFS journal implementation and contains various
+ * functions which manipulate the log. The log is a fixed area on the flash
+ * which does not contain any data but refers to buds. The log is a part of the
+ * journal.
+ */
+
+#include "ubifs.h"
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+static int dbg_check_bud_bytes(struct ubifs_info *c);
+#else
+#define dbg_check_bud_bytes(c) 0
+#endif
+
+/**
+ * ubifs_search_bud - look a bud up by its LEB number.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * The buds RB-tree is walked under 'c->buds_lock'. Returns the bud description
+ * object for LEB @lnum, or %NULL if there is no bud with this LEB number.
+ */
+struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum)
+{
+	struct ubifs_bud *found = NULL;
+	struct rb_node *node;
+
+	spin_lock(&c->buds_lock);
+	node = c->buds.rb_node;
+	while (node) {
+		struct ubifs_bud *cur = rb_entry(node, struct ubifs_bud, rb);
+
+		if (lnum == cur->lnum) {
+			found = cur;
+			break;
+		}
+		/* Smaller LEB numbers live in the left sub-tree */
+		node = lnum < cur->lnum ? node->rb_left : node->rb_right;
+	}
+	spin_unlock(&c->buds_lock);
+
+	return found;
+}
+
+/**
+ * ubifs_get_wbuf - get the wbuf associated with a LEB, if there is one.
+ * @c: UBIFS file-system description object
+ * @lnum: logical eraseblock number to search
+ *
+ * This function returns the wbuf for @lnum or %NULL if there is not one.
+ */
+struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum)
+{
+	struct rb_node *p;
+	struct ubifs_bud *bud;
+	int jhead;
+
+	/* Without journal heads there are no write-buffers to return */
+	if (!c->jheads)
+		return NULL;
+
+	spin_lock(&c->buds_lock);
+	p = c->buds.rb_node;
+	while (p) {
+		bud = rb_entry(p, struct ubifs_bud, rb);
+		if (lnum < bud->lnum)
+			p = p->rb_left;
+		else if (lnum > bud->lnum)
+			p = p->rb_right;
+		else {
+			/* Read the head number before dropping the lock */
+			jhead = bud->jhead;
+			spin_unlock(&c->buds_lock);
+			return &c->jheads[jhead].wbuf;
+		}
+	}
+	spin_unlock(&c->buds_lock);
+	return NULL;
+}
+
+/**
+ * next_log_lnum - switch to the next log LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: current log LEB
+ *
+ * Returns the LEB number following @lnum, wrapping from 'c->log_last' back to
+ * %UBIFS_LOG_LNUM.
+ */
+static inline int next_log_lnum(const struct ubifs_info *c, int lnum)
+{
+	lnum += 1;
+	if (lnum > c->log_last)
+		lnum = UBIFS_LOG_LNUM;
+
+	return lnum;
+}
+
+/**
+ * empty_log_bytes - calculate amount of empty space in the log.
+ * @c: UBIFS file-system description object
+ *
+ * The log head position @h and the log tail position @t are compared as byte
+ * offsets. When they are equal the log is either completely empty or
+ * completely full, and the two cases are told apart by the LEB numbers.
+ */
+static inline long long empty_log_bytes(const struct ubifs_info *c)
+{
+	long long h, t;
+
+	h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs;
+	t = (long long)c->ltail_lnum * c->leb_size;
+
+	if (h > t)
+		return c->log_bytes - h + t;
+	else if (h != t)
+		return t - h;
+	else if (c->lhead_lnum != c->ltail_lnum)
+		/*
+		 * The head offset has reached the end of the LEB which
+		 * precedes the tail LEB, so there is no space left at all.
+		 */
+		return 0;
+	else
+		/* Head and tail are at the start of the same LEB */
+		return c->log_bytes;
+}
+
+/**
+ * ubifs_add_bud - add bud LEB to the tree of buds and its journal head list.
+ * @c: UBIFS file-system description object
+ * @bud: the bud to add
+ *
+ * The bud is inserted into the 'c->buds' RB-tree, which is ordered by LEB
+ * number, and 'c->bud_bytes' is increased. Everything is done under
+ * 'c->buds_lock'.
+ */
+void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud)
+{
+	struct rb_node **p, *parent = NULL;
+	struct ubifs_bud *b;
+	struct ubifs_jhead *jhead;
+
+	spin_lock(&c->buds_lock);
+	p = &c->buds.rb_node;
+	while (*p) {
+		parent = *p;
+		b = rb_entry(parent, struct ubifs_bud, rb);
+		/* A LEB must not be added as a bud twice */
+		ubifs_assert(bud->lnum != b->lnum);
+		if (bud->lnum < b->lnum)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&bud->rb, parent, p);
+	rb_insert_color(&bud->rb, &c->buds);
+	if (c->jheads) {
+		jhead = &c->jheads[bud->jhead];
+		list_add_tail(&bud->list, &jhead->buds_list);
+	} else
+		/* No journal heads - only expected when replaying read-only */
+		ubifs_assert(c->replaying && (c->vfs_sb->s_flags & MS_RDONLY));
+
+	/*
+	 * Note, although this is a new bud, we anyway account this space now,
+	 * before any data has been written to it, because this is about to
+	 * guarantee fixed mount time, and this bud will anyway be read and
+	 * scanned.
+	 */
+	c->bud_bytes += c->leb_size - bud->start;
+
+	dbg_log("LEB %d:%d, jhead %d, bud_bytes %lld", bud->lnum,
+		bud->start, bud->jhead, c->bud_bytes);
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_create_buds_lists - create journal head buds lists for remount rw.
+ * @c: UBIFS file-system description object
+ *
+ * Every bud in the 'c->buds' tree is appended to the buds list of the journal
+ * head it belongs to.
+ */
+void ubifs_create_buds_lists(struct ubifs_info *c)
+{
+	struct rb_node *p;
+
+	spin_lock(&c->buds_lock);
+	p = rb_first(&c->buds);
+	while (p) {
+		struct ubifs_bud *bud = rb_entry(p, struct ubifs_bud, rb);
+		struct ubifs_jhead *jhead = &c->jheads[bud->jhead];
+
+		list_add_tail(&bud->list, &jhead->buds_list);
+		p = rb_next(p);
+	}
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_add_bud_to_log - add a new bud to the log.
+ * @c: UBIFS file-system description object
+ * @jhead: journal head the bud belongs to
+ * @lnum: LEB number of the bud
+ * @offs: starting offset of the bud
+ *
+ * This function writes a reference node for the new bud LEB @lnum to the log,
+ * and adds the bud to the buds tree. It also makes sure that the amount of
+ * space in buds does not exceed the 'c->max_bud_bytes' limit. Returns zero in
+ * case of success, %-EAGAIN if commit is required, and a negative error code
+ * in case of failure.
+ */
+int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
+{
+	int err;
+	struct ubifs_bud *bud;
+	struct ubifs_ref_node *ref;
+
+	/* Allocate everything before taking 'c->log_mutex' */
+	bud = kmalloc(sizeof(struct ubifs_bud), GFP_NOFS);
+	if (!bud)
+		return -ENOMEM;
+	ref = kzalloc(c->ref_node_alsz, GFP_NOFS);
+	if (!ref) {
+		kfree(bud);
+		return -ENOMEM;
+	}
+
+	mutex_lock(&c->log_mutex);
+
+	if (c->ro_media) {
+		err = -EROFS;
+		goto out_unlock;
+	}
+
+	/* Make sure we have enough space in the log */
+	if (empty_log_bytes(c) - c->ref_node_alsz < c->min_log_bytes) {
+		dbg_log("not enough log space - %lld, required %d",
+			empty_log_bytes(c), c->min_log_bytes);
+		ubifs_commit_required(c);
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/*
+	 * Make sure the amount of space in buds will not exceed the
+	 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
+	 * limits.
+	 *
+	 * It is not necessary to hold @c->buds_lock when reading @c->bud_bytes
+	 * because we are holding @c->log_mutex. All changes of @c->bud_bytes
+	 * take place when both @c->log_mutex and @c->buds_lock are held.
+	 */
+	if (c->bud_bytes + c->leb_size - offs > c->max_bud_bytes) {
+		dbg_log("bud bytes %lld (%lld max), require commit",
+			c->bud_bytes, c->max_bud_bytes);
+		ubifs_commit_required(c);
+		err = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/*
+	 * If the journal is full enough - start background commit. Note, it is
+	 * OK to read 'c->cmt_state' without spinlock because integer reads
+	 * are atomic in the kernel.
+	 */
+	if (c->bud_bytes >= c->bg_bud_bytes &&
+	    c->cmt_state == COMMIT_RESTING) {
+		dbg_log("bud bytes %lld (%lld max), initiate BG commit",
+			c->bud_bytes, c->max_bud_bytes);
+		ubifs_request_bg_commit(c);
+	}
+
+	bud->lnum = lnum;
+	bud->start = offs;
+	bud->jhead = jhead;
+
+	ref->ch.node_type = UBIFS_REF_NODE;
+	ref->lnum = cpu_to_le32(bud->lnum);
+	ref->offs = cpu_to_le32(bud->start);
+	ref->jhead = cpu_to_le32(jhead);
+
+	/* The reference node does not fit - move on to the next log LEB */
+	if (c->lhead_offs > c->leb_size - c->ref_node_alsz) {
+		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_offs = 0;
+	}
+
+	if (c->lhead_offs == 0) {
+		/* Must ensure next log LEB has been unmapped */
+		err = ubifs_leb_unmap(c, c->lhead_lnum);
+		if (err)
+			goto out_unlock;
+	}
+
+	if (bud->start == 0) {
+		/*
+		 * Before writing the LEB reference which refers an empty LEB
+		 * to the log, we have to make sure it is mapped, because
+		 * otherwise we'd risk to refer an LEB with garbage in case of
+		 * an unclean reboot, because the target LEB might have been
+		 * unmapped, but not yet physically erased.
+		 */
+		err = ubi_leb_map(c->ubi, bud->lnum, UBI_SHORTTERM);
+		if (err)
+			goto out_unlock;
+	}
+
+	dbg_log("write ref LEB %d:%d",
+		c->lhead_lnum, c->lhead_offs);
+	err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
+			       c->lhead_offs, UBI_SHORTTERM);
+	if (err)
+		goto out_unlock;
+
+	c->lhead_offs += c->ref_node_alsz;
+
+	/* From now on the bud is referenced by the buds tree */
+	ubifs_add_bud(c, bud);
+
+	mutex_unlock(&c->log_mutex);
+	kfree(ref);
+	return 0;
+
+out_unlock:
+	/* Anything but a "commit required" request switches to R/O mode */
+	if (err != -EAGAIN)
+		ubifs_ro_mode(c, err);
+	mutex_unlock(&c->log_mutex);
+	kfree(ref);
+	kfree(bud);
+	return err;
+}
+
+/**
+ * remove_buds - remove used buds.
+ * @c: UBIFS file-system description object
+ *
+ * This function removes used buds from the buds tree. It does not remove the
+ * buds which are pointed to by journal heads.
+ */
+static void remove_buds(struct ubifs_info *c)
+{
+	struct rb_node *p;
+
+	ubifs_assert(list_empty(&c->old_buds));
+	/* Re-calculated below as the sum over all buds in the tree */
+	c->cmt_bud_bytes = 0;
+	spin_lock(&c->buds_lock);
+	p = rb_first(&c->buds);
+	while (p) {
+		struct rb_node *p1 = p;
+		struct ubifs_bud *bud;
+		struct ubifs_wbuf *wbuf;
+
+		/* Advance first, because @p1 may be erased from the tree */
+		p = rb_next(p);
+		bud = rb_entry(p1, struct ubifs_bud, rb);
+		wbuf = &c->jheads[bud->jhead].wbuf;
+
+		if (wbuf->lnum == bud->lnum) {
+			/*
+			 * Do not remove buds which are pointed to by journal
+			 * heads (non-closed buds).
+			 */
+			c->cmt_bud_bytes += wbuf->offs - bud->start;
+			dbg_log("preserve %d:%d, jhead %d, bud bytes %d, "
+				"cmt_bud_bytes %lld", bud->lnum, bud->start,
+				bud->jhead, wbuf->offs - bud->start,
+				c->cmt_bud_bytes);
+			/* The bud now starts at the write-buffer offset */
+			bud->start = wbuf->offs;
+		} else {
+			c->cmt_bud_bytes += c->leb_size - bud->start;
+			dbg_log("remove %d:%d, jhead %d, bud bytes %d, "
+				"cmt_bud_bytes %lld", bud->lnum, bud->start,
+				bud->jhead, c->leb_size - bud->start,
+				c->cmt_bud_bytes);
+			rb_erase(p1, &c->buds);
+			/*
+			 * If the commit does not finish, the recovery will need
+			 * to replay the journal, in which case the old buds
+			 * must be unchanged. Do not release them until post
+			 * commit i.e. do not allow them to be garbage
+			 * collected.
+			 */
+			list_move(&bud->list, &c->old_buds);
+		}
+	}
+	spin_unlock(&c->buds_lock);
+}
+
+/**
+ * ubifs_log_start_commit - start commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: return new log tail LEB number
+ *
+ * The commit operation starts with writing "commit start" node to the log and
+ * reference nodes for all journal heads which will define new journal after
+ * the commit has been finished. The commit start and reference nodes are
+ * written in one go to the nearest empty log LEB (hence, when commit is
+ * finished UBIFS may safely unmap all the previous log LEBs). This function
+ * returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum)
+{
+	void *buf;
+	struct ubifs_cs_node *cs;
+	struct ubifs_ref_node *ref;
+	int err, i, max_len, len;
+
+	err = dbg_check_bud_bytes(c);
+	if (err)
+		return err;
+
+	/* Room for the commit start node and one reference node per head */
+	max_len = UBIFS_CS_NODE_SZ + c->jhead_cnt * UBIFS_REF_NODE_SZ;
+	max_len = ALIGN(max_len, c->min_io_size);
+	buf = cs = kmalloc(max_len, GFP_NOFS);
+	if (!buf)
+		return -ENOMEM;
+
+	cs->ch.node_type = UBIFS_CS_NODE;
+	cs->cmt_no = cpu_to_le64(c->cmt_no);
+	ubifs_prepare_node(c, cs, UBIFS_CS_NODE_SZ, 0);
+
+	/*
+	 * Note, we do not lock 'c->log_mutex' because this is the commit start
+	 * phase and we are exclusively using the log. And we do not lock
+	 * write-buffer because nobody can write to the file-system at this
+	 * phase.
+	 */
+
+	len = UBIFS_CS_NODE_SZ;
+	for (i = 0; i < c->jhead_cnt; i++) {
+		int lnum = c->jheads[i].wbuf.lnum;
+		int offs = c->jheads[i].wbuf.offs;
+
+		/* Skip heads which have no LEB or whose LEB is full */
+		if (lnum == -1 || offs == c->leb_size)
+			continue;
+
+		dbg_log("add ref to LEB %d:%d for jhead %d", lnum, offs, i);
+		ref = buf + len;
+		ref->ch.node_type = UBIFS_REF_NODE;
+		ref->lnum = cpu_to_le32(lnum);
+		ref->offs = cpu_to_le32(offs);
+		ref->jhead = cpu_to_le32(i);
+
+		ubifs_prepare_node(c, ref, UBIFS_REF_NODE_SZ, 0);
+		len += UBIFS_REF_NODE_SZ;
+	}
+
+	/* Pad the buffer up to the min. I/O unit boundary */
+	ubifs_pad(c, buf + len, ALIGN(len, c->min_io_size) - len);
+
+	/* Switch to the next log LEB */
+	if (c->lhead_offs) {
+		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_offs = 0;
+	}
+
+	if (c->lhead_offs == 0) {
+		/* Must ensure next LEB has been unmapped */
+		err = ubifs_leb_unmap(c, c->lhead_lnum);
+		if (err)
+			goto out;
+	}
+
+	len = ALIGN(len, c->min_io_size);
+	dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
+	err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
+	if (err)
+		goto out;
+
+	/* The LEB which holds the commit start node becomes the new tail */
+	*ltail_lnum = c->lhead_lnum;
+
+	c->lhead_offs += len;
+	if (c->lhead_offs == c->leb_size) {
+		c->lhead_lnum = next_log_lnum(c, c->lhead_lnum);
+		c->lhead_offs = 0;
+	}
+
+	remove_buds(c);
+
+	/*
+	 * We have started the commit and now users may use the rest of the log
+	 * for new writes.
+	 */
+	c->min_log_bytes = 0;
+
+out:
+	kfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_log_end_commit - end commit.
+ * @c: UBIFS file-system description object
+ * @ltail_lnum: new log tail LEB number
+ *
+ * This function is called when the commit operation has finished. It moves
+ * the log tail to the new position, reserves 'c->leb_size' bytes of the log
+ * for the next commit and subtracts 'c->cmt_bud_bytes' from 'c->bud_bytes'.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum)
+{
+	int err;
+
+	/*
+	 * At this phase we have to lock 'c->log_mutex' because UBIFS allows FS
+	 * writes during commit. It is only the short "commit start" phase when
+	 * writers are blocked.
+	 */
+	mutex_lock(&c->log_mutex);
+
+	dbg_log("old tail was LEB %d:0, new tail is LEB %d:0",
+		c->ltail_lnum, ltail_lnum);
+
+	c->ltail_lnum = ltail_lnum;
+	/*
+	 * The commit is finished and from now on it must be guaranteed that
+	 * there is always enough space for the next commit.
+	 */
+	c->min_log_bytes = c->leb_size;
+
+	spin_lock(&c->buds_lock);
+	c->bud_bytes -= c->cmt_bud_bytes;
+	spin_unlock(&c->buds_lock);
+
+	err = dbg_check_bud_bytes(c);
+
+	mutex_unlock(&c->log_mutex);
+	return err;
+}
+
+/**
+ * ubifs_log_post_commit - things to do after commit is completed.
+ * @c: UBIFS file-system description object
+ * @old_ltail_lnum: old log tail LEB number
+ *
+ * Release buds only after commit is completed, because they must be unchanged
+ * if recovery is needed.
+ *
+ * Unmap log LEBs only after commit is completed, because they may be needed for
+ * recovery.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum)
+{
+	int err = 0;
+	int lnum;
+
+	/* Give back the LEBs of the old buds, one list head at a time */
+	while (!list_empty(&c->old_buds)) {
+		struct ubifs_bud *old;
+
+		old = list_entry(c->old_buds.next, struct ubifs_bud, list);
+		err = ubifs_return_leb(c, old->lnum);
+		if (err)
+			return err;
+		list_del(&old->list);
+		kfree(old);
+	}
+
+	/* Unmap the log LEBs between the old tail and the current tail */
+	mutex_lock(&c->log_mutex);
+	lnum = old_ltail_lnum;
+	while (lnum != c->ltail_lnum) {
+		dbg_log("unmap log LEB %d", lnum);
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			break;
+		lnum = next_log_lnum(c, lnum);
+	}
+	mutex_unlock(&c->log_mutex);
+
+	return err;
+}
+
+/**
+ * struct done_ref - references that have been done.
+ * @rb: rb-tree node
+ * @lnum: LEB number
+ */
+struct done_ref {
+	struct rb_node rb;
+	int lnum;
+};
+
+/**
+ * done_already - determine if a reference has been done already.
+ * @done_tree: rb-tree to store references that have been done
+ * @lnum: LEB number of reference
+ *
+ * If @lnum is not in @done_tree yet, it is added there. This function returns
+ * %1 if the reference has been done, %0 if not, otherwise a negative error
+ * code is returned.
+ */
+static int done_already(struct rb_root *done_tree, int lnum)
+{
+	struct rb_node **link = &done_tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct done_ref *dr;
+
+	while (*link) {
+		parent = *link;
+		dr = rb_entry(parent, struct done_ref, rb);
+		if (lnum == dr->lnum)
+			return 1;
+		link = lnum < dr->lnum ? &parent->rb_left : &parent->rb_right;
+	}
+
+	/* Not seen before - remember it */
+	dr = kzalloc(sizeof(struct done_ref), GFP_NOFS);
+	if (!dr)
+		return -ENOMEM;
+
+	dr->lnum = lnum;
+	rb_link_node(&dr->rb, parent, link);
+	rb_insert_color(&dr->rb, done_tree);
+
+	return 0;
+}
+
+/**
+ * destroy_done_tree - destroy the done tree.
+ * @done_tree: done tree to destroy
+ *
+ * The nodes are freed leaves first: the walk goes down until it reaches a node
+ * without children, detaches it from its parent and frees it.
+ */
+static void destroy_done_tree(struct rb_root *done_tree)
+{
+	struct rb_node *this = done_tree->rb_node;
+	struct done_ref *dr;
+
+	while (this) {
+		/* Descend until a node without children is found */
+		if (this->rb_left) {
+			this = this->rb_left;
+			continue;
+		} else if (this->rb_right) {
+			this = this->rb_right;
+			continue;
+		}
+		dr = rb_entry(this, struct done_ref, rb);
+		this = rb_parent(this);
+		if (this) {
+			/* Clear the parent's link to the node being freed */
+			if (this->rb_left == &dr->rb)
+				this->rb_left = NULL;
+			else
+				this->rb_right = NULL;
+		}
+		kfree(dr);
+	}
+}
+
+/**
+ * add_node - add a node to the consolidated log.
+ * @c: UBIFS file-system description object
+ * @buf: buffer to which to add
+ * @lnum: LEB number to which to write is passed and returned here
+ * @offs: offset to where to write is passed and returned here
+ * @node: node to add
+ *
+ * If @node does not fit into the rest of the LEB, the buffer is padded and
+ * written to LEB @lnum first, and then @lnum is switched to the next log LEB.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int add_node(struct ubifs_info *c, void *buf, int *lnum, int *offs,
+		    void *node)
+{
+	struct ubifs_ch *ch = node;
+	int len = le32_to_cpu(ch->len), remains = c->leb_size - *offs;
+
+	if (len > remains) {
+		int sz = ALIGN(*offs, c->min_io_size), err;
+
+		/* Flush what has been gathered so far and start a new LEB */
+		ubifs_pad(c, buf + *offs, sz - *offs);
+		err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
+		if (err)
+			return err;
+		*lnum = next_log_lnum(c, *lnum);
+		*offs = 0;
+	}
+	memcpy(buf + *offs, node, len);
+	/* The next node starts at an 8-byte aligned offset */
+	*offs += ALIGN(len, 8);
+	return 0;
+}
+
+/**
+ * ubifs_consolidate_log - consolidate the log.
+ * @c: UBIFS file-system description object
+ *
+ * Repeated failed commits could cause the log to be full, but at least 1 LEB is
+ * needed for commit. This function rewrites the reference nodes in the log
+ * omitting duplicates, and failed CS nodes, and leaving no gaps.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_consolidate_log(struct ubifs_info *c)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	struct rb_root done_tree = RB_ROOT;
+	int lnum, err, first = 1, write_lnum, offs = 0;
+	void *buf;
+
+	dbg_rcvry("log tail LEB %d, log head LEB %d", c->ltail_lnum,
+		  c->lhead_lnum);
+	buf = vmalloc(c->leb_size);
+	if (!buf)
+		return -ENOMEM;
+	/* Scan from the log tail and write back starting at the same LEB */
+	lnum = c->ltail_lnum;
+	write_lnum = lnum;
+	while (1) {
+		sleb = ubifs_scan(c, lnum, 0, c->sbuf);
+		if (IS_ERR(sleb)) {
+			err = PTR_ERR(sleb);
+			goto out_free;
+		}
+		list_for_each_entry(snod, &sleb->nodes, list) {
+			switch (snod->type) {
+			case UBIFS_REF_NODE: {
+				struct ubifs_ref_node *ref = snod->node;
+				int ref_lnum = le32_to_cpu(ref->lnum);
+
+				/* Keep only the first reference to a LEB */
+				err = done_already(&done_tree, ref_lnum);
+				if (err < 0)
+					goto out_scan;
+				if (err != 1) {
+					err = add_node(c, buf, &write_lnum,
+						       &offs, snod->node);
+					if (err)
+						goto out_scan;
+				}
+				break;
+			}
+			case UBIFS_CS_NODE:
+				/* Keep only the first commit start node */
+				if (!first)
+					break;
+				err = add_node(c, buf, &write_lnum, &offs,
+					       snod->node);
+				if (err)
+					goto out_scan;
+				first = 0;
+				break;
+			}
+		}
+		ubifs_scan_destroy(sleb);
+		if (lnum == c->lhead_lnum)
+			break;
+		lnum = next_log_lnum(c, lnum);
+	}
+	if (offs) {
+		int sz = ALIGN(offs, c->min_io_size);
+
+		/* Write out the last, partially filled LEB */
+		ubifs_pad(c, buf + offs, sz - offs);
+		err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
+		if (err)
+			goto out_free;
+		offs = ALIGN(offs, c->min_io_size);
+	}
+	destroy_done_tree(&done_tree);
+	vfree(buf);
+	/* Consolidation did not free a single log LEB */
+	if (write_lnum == c->lhead_lnum) {
+		ubifs_err("log is too full");
+		return -EINVAL;
+	}
+	/* Unmap remaining LEBs */
+	lnum = write_lnum;
+	do {
+		lnum = next_log_lnum(c, lnum);
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	} while (lnum != c->lhead_lnum);
+	c->lhead_lnum = write_lnum;
+	c->lhead_offs = offs;
+	dbg_rcvry("new log head at %d:%d", c->lhead_lnum, c->lhead_offs);
+	return 0;
+
+out_scan:
+	ubifs_scan_destroy(sleb);
+out_free:
+	destroy_done_tree(&done_tree);
+	vfree(buf);
+	return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure the amount of flash space accounted for buds
+ * ('c->bud_bytes') is correct, by re-calculating it from the buds lists of all
+ * journal heads. The check is done only if the %UBIFS_CHK_GEN flag is set in
+ * 'ubifs_chk_flags'. Returns zero in case of success and %-EINVAL in case of
+ * failure.
+ */
+static int dbg_check_bud_bytes(struct ubifs_info *c)
+{
+	int i, err = 0;
+	struct ubifs_bud *bud;
+	long long bud_bytes = 0;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_GEN))
+		return 0;
+
+	spin_lock(&c->buds_lock);
+	for (i = 0; i < c->jhead_cnt; i++)
+		list_for_each_entry(bud, &c->jheads[i].buds_list, list)
+			bud_bytes += c->leb_size - bud->start;
+
+	if (c->bud_bytes != bud_bytes) {
+		ubifs_err("bad bud_bytes %lld, calculated %lld",
+			  c->bud_bytes, bud_bytes);
+		err = -EINVAL;
+	}
+	spin_unlock(&c->buds_lock);
+
+	return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -Nurd linux-2.6.24/fs/ubifs/lprops.c ubifs-v2.6.24/fs/ubifs/lprops.c
--- linux-2.6.24/fs/ubifs/lprops.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/lprops.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,1333 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the functions that access LEB properties and their
+ * categories. LEBs are categorized based on the needs of UBIFS, and the
+ * categories are stored as either heaps or lists to provide a fast way of
+ * finding a LEB in a particular category. For example, UBIFS may need to find
+ * an empty LEB for the journal, or a very dirty LEB for garbage collection.
+ */
+
+#include "ubifs.h"
+
+/**
+ * get_heap_comp_val - get the LEB properties value for heap comparisons.
+ * @lprops: LEB properties
+ * @cat: LEB category
+ *
+ * Returns the free space for %LPROPS_FREE, the sum of free and dirty space for
+ * %LPROPS_DIRTY_IDX, and the dirty space for any other category.
+ */
+static int get_heap_comp_val(struct ubifs_lprops *lprops, int cat)
+{
+	if (cat == LPROPS_FREE)
+		return lprops->free;
+	if (cat == LPROPS_DIRTY_IDX)
+		return lprops->free + lprops->dirty;
+
+	return lprops->dirty;
+}
+
+/**
+ * move_up_lpt_heap - move a new heap entry up as far as possible.
+ * @c: UBIFS file-system description object
+ * @heap: LEB category heap
+ * @lprops: LEB properties to move
+ * @cat: LEB category
+ *
+ * New entries to a heap are added at the bottom and then moved up until the
+ * parent's value is greater. In the case of LPT's category heaps, the value
+ * is either the amount of free space or the amount of dirty space, depending
+ * on the category.
+ */ +static void move_up_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int cat) +{ + int val1, val2, hpos; + + hpos = lprops->hpos; + if (!hpos) + return; /* Already top of the heap */ + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater, move up the heap */ + do { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val2 >= val1) + return; + /* Greater than parent so move up */ + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + } while (hpos); +} + +/** + * adjust_lpt_heap - move a changed heap entry up or down the heap. + * @c: UBIFS file-system description object + * @heap: LEB category heap + * @lprops: LEB properties to move + * @hpos: heap position of @lprops + * @cat: LEB category + * + * Changed entries in a heap are moved up or down until the parent's value is + * greater. In the case of LPT's category heaps, the value is either the amount + * of free space or the amount of dirty space, depending on the category. 
+ */ +static void adjust_lpt_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, + struct ubifs_lprops *lprops, int hpos, int cat) +{ + int val1, val2, val3, cpos; + + val1 = get_heap_comp_val(lprops, cat); + /* Compare to parent and, if greater than parent, move up the heap */ + if (hpos) { + int ppos = (hpos - 1) / 2; + + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 > val2) { + /* Greater than parent so move up */ + while (1) { + heap->arr[ppos]->hpos = hpos; + heap->arr[hpos] = heap->arr[ppos]; + heap->arr[ppos] = lprops; + lprops->hpos = ppos; + hpos = ppos; + if (!hpos) + return; + ppos = (hpos - 1) / 2; + val2 = get_heap_comp_val(heap->arr[ppos], cat); + if (val1 <= val2) + return; + /* Still greater than parent so keep going */ + } + } + } + + /* Not greater than parent, so compare to children */ + while (1) { + /* Compare to left child */ + cpos = hpos * 2 + 1; + if (cpos >= heap->cnt) + return; + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val2) { + /* Less than left child, so promote biggest child */ + if (cpos + 1 < heap->cnt) { + val3 = get_heap_comp_val(heap->arr[cpos + 1], + cat); + if (val3 > val2) + cpos += 1; /* Right child is bigger */ + } + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + /* Compare to right child */ + cpos += 1; + if (cpos >= heap->cnt) + return; + val3 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 < val3) { + /* Less than right child, so promote right child */ + heap->arr[cpos]->hpos = hpos; + heap->arr[hpos] = heap->arr[cpos]; + heap->arr[cpos] = lprops; + lprops->hpos = cpos; + hpos = cpos; + continue; + } + return; + } +} + +/** + * add_to_lpt_heap - add LEB properties to a LEB category heap. 
+ * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category + * + * This function returns %1 if @lprops is added to the heap for LEB category + * @cat, otherwise %0 is returned because the heap is full. + */ +static int add_to_lpt_heap(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if (heap->cnt >= heap->max_cnt) { + const int b = LPT_HEAP_SZ / 2 - 1; + int cpos, val1, val2; + + /* Compare to some other LEB on the bottom of heap */ + /* Pick a position kind of randomly */ + cpos = (((size_t)lprops >> 4) & b) + b; + ubifs_assert(cpos >= b); + ubifs_assert(cpos < LPT_HEAP_SZ); + ubifs_assert(cpos < heap->cnt); + + val1 = get_heap_comp_val(lprops, cat); + val2 = get_heap_comp_val(heap->arr[cpos], cat); + if (val1 > val2) { + struct ubifs_lprops *lp; + + lp = heap->arr[cpos]; + lp->flags &= ~LPROPS_CAT_MASK; + lp->flags |= LPROPS_UNCAT; + list_add(&lp->list, &c->uncat_list); + lprops->hpos = cpos; + heap->arr[cpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } + dbg_check_heap(c, heap, cat, -1); + return 0; /* Not added to heap */ + } else { + lprops->hpos = heap->cnt++; + heap->arr[lprops->hpos] = lprops; + move_up_lpt_heap(c, heap, lprops, cat); + dbg_check_heap(c, heap, cat, lprops->hpos); + return 1; /* Added to heap */ + } +} + +/** + * remove_from_lpt_heap - remove LEB properties from a LEB category heap. 
+ * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category + */ +static void remove_from_lpt_heap(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + ubifs_assert(hpos >= 0 && hpos < heap->cnt); + ubifs_assert(heap->arr[hpos] == lprops); + heap->cnt -= 1; + if (hpos < heap->cnt) { + heap->arr[hpos] = heap->arr[heap->cnt]; + heap->arr[hpos]->hpos = hpos; + adjust_lpt_heap(c, heap, heap->arr[hpos], hpos, cat); + } + dbg_check_heap(c, heap, cat, -1); +} + +/** + * lpt_heap_replace - replace lprops in a category heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * @cat: LEB category + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * the category heaps to those lprops must be updated to point to the new + * lprops. This function does that. + */ +static void lpt_heap_replace(struct ubifs_info *c, + struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops, int cat) +{ + struct ubifs_lpt_heap *heap; + int hpos = new_lprops->hpos; + + heap = &c->lpt_heap[cat - 1]; + heap->arr[hpos] = new_lprops; +} + +/** + * ubifs_add_to_cat - add LEB properties to a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to add + * @cat: LEB category to which to add + * + * LEB properties are categorized to enable fast find operations. 
+ */ +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + if (add_to_lpt_heap(c, lprops, cat)) + break; + /* No more room on heap so make it uncategorized */ + cat = LPROPS_UNCAT; + /* Fall through */ + case LPROPS_UNCAT: + list_add(&lprops->list, &c->uncat_list); + break; + case LPROPS_EMPTY: + list_add(&lprops->list, &c->empty_list); + break; + case LPROPS_FREEABLE: + list_add(&lprops->list, &c->freeable_list); + c->freeable_cnt += 1; + break; + case LPROPS_FRDI_IDX: + list_add(&lprops->list, &c->frdi_idx_list); + break; + default: + ubifs_assert(0); + } + lprops->flags &= ~LPROPS_CAT_MASK; + lprops->flags |= cat; +} + +/** + * ubifs_remove_from_cat - remove LEB properties from a category list or heap. + * @c: UBIFS file-system description object + * @lprops: LEB properties to remove + * @cat: LEB category from which to remove + * + * LEB properties are categorized to enable fast find operations. + */ +static void ubifs_remove_from_cat(struct ubifs_info *c, + struct ubifs_lprops *lprops, int cat) +{ + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + remove_from_lpt_heap(c, lprops, cat); + break; + case LPROPS_FREEABLE: + c->freeable_cnt -= 1; + ubifs_assert(c->freeable_cnt >= 0); + /* Fall through */ + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FRDI_IDX: + ubifs_assert(!list_empty(&lprops->list)); + list_del(&lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_replace_cat - replace lprops in a category list or heap. + * @c: UBIFS file-system description object + * @old_lprops: LEB properties to replace + * @new_lprops: LEB properties with which to replace + * + * During commit it is sometimes necessary to copy a pnode (see dirty_cow_pnode) + * and the lprops that the pnode contains. When that happens, references in + * category lists and heaps must be replaced. 
This function does that. + */ +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops) +{ + int cat; + + cat = new_lprops->flags & LPROPS_CAT_MASK; + switch (cat) { + case LPROPS_DIRTY: + case LPROPS_DIRTY_IDX: + case LPROPS_FREE: + lpt_heap_replace(c, old_lprops, new_lprops, cat); + break; + case LPROPS_UNCAT: + case LPROPS_EMPTY: + case LPROPS_FREEABLE: + case LPROPS_FRDI_IDX: + list_replace(&old_lprops->list, &new_lprops->list); + break; + default: + ubifs_assert(0); + } +} + +/** + * ubifs_ensure_cat - ensure LEB properties are categorized. + * @c: UBIFS file-system description object + * @lprops: LEB properties + * + * A LEB may have fallen off of the bottom of a heap, and ended up as + * uncategorized even though it has enough space for us now. If that is the case + * this function will put the LEB back onto a heap. + */ +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int cat = lprops->flags & LPROPS_CAT_MASK; + + if (cat != LPROPS_UNCAT) + return; + cat = ubifs_categorize_lprops(c, lprops); + if (cat == LPROPS_UNCAT) + return; + ubifs_remove_from_cat(c, lprops, LPROPS_UNCAT); + ubifs_add_to_cat(c, lprops, cat); +} + +/** + * ubifs_categorize_lprops - categorize LEB properties. + * @c: UBIFS file-system description object + * @lprops: LEB properties to categorize + * + * LEB properties are categorized to enable fast find operations. This function + * returns the LEB category to which the LEB properties belong. Note however + * that if the LEB category is stored as a heap and the heap is full, the + * LEB properties may have their category changed to %LPROPS_UNCAT. 
+ */ +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops) +{ + if (lprops->flags & LPROPS_TAKEN) + return LPROPS_UNCAT; + + if (lprops->free == c->leb_size) { + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return LPROPS_EMPTY; + } + + if (lprops->free + lprops->dirty == c->leb_size) { + if (lprops->flags & LPROPS_INDEX) + return LPROPS_FRDI_IDX; + else + return LPROPS_FREEABLE; + } + + if (lprops->flags & LPROPS_INDEX) { + if (lprops->dirty + lprops->free >= c->min_idx_node_sz) + return LPROPS_DIRTY_IDX; + } else { + if (lprops->dirty >= c->dead_wm && + lprops->dirty > lprops->free) + return LPROPS_DIRTY; + if (lprops->free > 0) + return LPROPS_FREE; + } + + return LPROPS_UNCAT; +} + +/** + * change_category - change LEB properties category. + * @c: UBIFS file-system description object + * @lprops: LEB properties to recategorize + * + * If the category is unchanged, @lprops is just re-sorted in its heap (list + * categories need no work); otherwise it is moved to the new category. + */ +static void change_category(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + int old_cat = lprops->flags & LPROPS_CAT_MASK; + int new_cat = ubifs_categorize_lprops(c, lprops); + + if (old_cat == new_cat) { + struct ubifs_lpt_heap *heap; + + /* lprops on a heap now must be moved up or down */ + if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT) + return; /* Not on a heap */ + heap = &c->lpt_heap[new_cat - 1]; /* index checked above */ + adjust_lpt_heap(c, heap, lprops, lprops->hpos, new_cat); + } else { + ubifs_remove_from_cat(c, lprops, old_cat); + ubifs_add_to_cat(c, lprops, new_cat); + } +} + +/** + * calc_dark - calculate LEB dark space size. + * @c: the UBIFS file-system description object + * @spc: amount of free and dirty space in the LEB + * + * This function calculates amount of dark space in an LEB which has @spc bytes + * of free and dirty space. Returns the calculations result.
+ * + * Dark space is the space which is not always usable - it depends on which + * nodes are written in which order. E.g., if an LEB has only 512 free bytes, + * it is dark space, because it cannot fit a large data node. So UBIFS cannot + * count on this LEB and treat these 512 bytes as usable because it is not true + * if, for example, only big chunks of uncompressible data will be written to + * the FS. + */ +static int calc_dark(struct ubifs_info *c, int spc) +{ + ubifs_assert(!(spc & 7)); + + if (spc < c->dark_wm) + return spc; + + /* + * If we have slightly more space than the dark space watermark, we can + * anyway safely assume that we'll be able to write a node of the + * smallest size there. + */ + if (spc - c->dark_wm < MIN_WRITE_SZ) + return spc - MIN_WRITE_SZ; + + return c->dark_wm; +} + +/** + * is_lprops_dirty - determine if LEB properties are dirty. + * @c: the UBIFS file-system description object + * @lprops: LEB properties to test + */ +static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops) +{ + struct ubifs_pnode *pnode; + int pos; + + pos = (lprops->lnum - c->main_first) & (UBIFS_LPT_FANOUT - 1); + pnode = (struct ubifs_pnode *)container_of(lprops - pos, + struct ubifs_pnode, + lprops[0]); + return !test_bit(COW_ZNODE, &pnode->flags) && + test_bit(DIRTY_CNODE, &pnode->flags); +} + +/** + * ubifs_change_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lp: LEB properties to change + * @free: new free space amount + * @dirty: new dirty space amount + * @flags: new flags + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes LEB properties (@free, @dirty or @flags). However, the + * property which has the %LPROPS_NC value is not changed. Returns a pointer to + * the updated LEB properties on success and an error pointer on failure.
+ * + * Note, the LEB properties may have had to be copied (due to COW) and + * consequently the pointer returned may not be the same as the pointer + * passed. + */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt) +{ + /* + * This is the only function that is allowed to change lprops, so we + * discard the const qualifier. + */ + struct ubifs_lprops *lprops = (struct ubifs_lprops *)lp; + + dbg_lp("LEB %d, free %d, dirty %d, flags %d", + lprops->lnum, free, dirty, flags); + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + ubifs_assert(c->freeable_cnt >= 0); + ubifs_assert(c->freeable_cnt <= c->main_lebs); + ubifs_assert(c->lst.taken_empty_lebs >= 0); + ubifs_assert(c->lst.taken_empty_lebs <= c->lst.empty_lebs); + ubifs_assert(!(c->lst.total_free & 7) && !(c->lst.total_dirty & 7)); + ubifs_assert(!(c->lst.total_dead & 7) && !(c->lst.total_dark & 7)); + ubifs_assert(!(c->lst.total_used & 7)); + ubifs_assert(free == LPROPS_NC || free >= 0); + ubifs_assert(dirty == LPROPS_NC || dirty >= 0); + + if (!is_lprops_dirty(c, lprops)) { + lprops = ubifs_lpt_lookup_dirty(c, lprops->lnum); + if (IS_ERR(lprops)) + return lprops; + } else + ubifs_assert(lprops == ubifs_lpt_lookup_dirty(c, lprops->lnum)); + + ubifs_assert(!(lprops->free & 7) && !(lprops->dirty & 7)); + + spin_lock(&c->space_lock); + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs -= 1; + + if (!(lprops->flags & LPROPS_INDEX)) { + int old_spc; + + old_spc = lprops->free + lprops->dirty; + if (old_spc < c->dead_wm) + c->lst.total_dead -= old_spc; + else + c->lst.total_dark -= calc_dark(c, old_spc); + + c->lst.total_used -= c->leb_size - old_spc; + } + + if (free != LPROPS_NC) { + free = ALIGN(free, 8); + c->lst.total_free += free - lprops->free; + + /* Increase or decrease empty LEBs counter if 
needed */ + if (free == c->leb_size) { + if (lprops->free != c->leb_size) + c->lst.empty_lebs += 1; + } else if (lprops->free == c->leb_size) + c->lst.empty_lebs -= 1; + lprops->free = free; + } + + if (dirty != LPROPS_NC) { + dirty = ALIGN(dirty, 8); + c->lst.total_dirty += dirty - lprops->dirty; + lprops->dirty = dirty; + } + + if (flags != LPROPS_NC) { + /* Take care about indexing LEBs counter if needed */ + if ((lprops->flags & LPROPS_INDEX)) { + if (!(flags & LPROPS_INDEX)) + c->lst.idx_lebs -= 1; + } else if (flags & LPROPS_INDEX) + c->lst.idx_lebs += 1; + lprops->flags = flags; + } + + if (!(lprops->flags & LPROPS_INDEX)) { + int new_spc; + + new_spc = lprops->free + lprops->dirty; + if (new_spc < c->dead_wm) + c->lst.total_dead += new_spc; + else + c->lst.total_dark += calc_dark(c, new_spc); + + c->lst.total_used += c->leb_size - new_spc; + } + + if ((lprops->flags & LPROPS_TAKEN) && lprops->free == c->leb_size) + c->lst.taken_empty_lebs += 1; + + change_category(c, lprops); + c->idx_gc_cnt += idx_gc_cnt; + spin_unlock(&c->space_lock); + return lprops; +} + +/** + * ubifs_get_lp_stats - get lprops statistics. + * @c: UBIFS file-system description object + * @lst: buffer that receives a copy of the statistics + */ +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst) +{ + spin_lock(&c->space_lock); + memcpy(lst, &c->lst, sizeof(struct ubifs_lp_stats)); + spin_unlock(&c->space_lock); +} + +/** + * ubifs_change_one_lp - change LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space + * @flags_set: flags to set + * @flags_clean: flags to clean + * @idx_gc_cnt: change to the count of idx_gc list + * + * This function changes properties of LEB @lnum. It is a helper wrapper over + * 'ubifs_change_lp()' which hides lprops get/release. The arguments are the + * same as in case of 'ubifs_change_lp()'.
Returns zero in case of success and + * a negative error code in case of failure. + */ +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, dirty, flags, idx_gc_cnt); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot change properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_update_one_lp - update LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to change properties for + * @free: amount of free space + * @dirty: amount of dirty space to add + * @flags_set: flags to set + * @flags_clean: flags to clear + * + * This function is the same as 'ubifs_change_one_lp()' but @dirty is added to + * the current dirty space instead of replacing it. + */ +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean) +{ + int err = 0, flags; + const struct ubifs_lprops *lp; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + flags = (lp->flags | flags_set) & ~flags_clean; + lp = ubifs_change_lp(c, lp, free, lp->dirty + dirty, flags, 0); + if (IS_ERR(lp)) + err = PTR_ERR(lp); + +out: + ubifs_release_lprops(c); + if (err) + ubifs_err("cannot update properties of LEB %d, error %d", + lnum, err); + return err; +} + +/** + * ubifs_read_one_lp - read LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to read properties for + * @lp: where to store read properties + * + * This helper function reads properties of a LEB @lnum and stores them in @lp.
+ * Returns zero in case of success and a negative error code in case of + * failure. + */ +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) +{ + int err = 0; + const struct ubifs_lprops *lpp; + + ubifs_get_lprops(c); + + lpp = ubifs_lpt_lookup(c, lnum); + if (IS_ERR(lpp)) { + err = PTR_ERR(lpp); + ubifs_err("cannot read properties of LEB %d, error %d", + lnum, err); + goto out; + } + + memcpy(lp, lpp, sizeof(struct ubifs_lprops)); + +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_fast_find_free - try to find a LEB with free space quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a LEB with free space or %NULL if + * the function is unable to find a LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct ubifs_lpt_heap *heap; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + heap = &c->lpt_heap[LPROPS_FREE - 1]; + if (heap->cnt == 0) + return NULL; + + lprops = heap->arr[0]; + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + return lprops; +} + +/** + * ubifs_fast_find_empty - try to find an empty LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for an empty LEB or %NULL if the + * function is unable to find an empty LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->empty_list)) + return NULL; + + lprops = list_entry(c->empty_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free == c->leb_size); + return lprops; +} + +/** + * ubifs_fast_find_freeable - try to find a freeable LEB quickly. 
+ * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable LEB or %NULL if the + * function is unable to find a freeable LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->freeable_list)) + return NULL; + + lprops = list_entry(c->freeable_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert(!(lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + ubifs_assert(c->freeable_cnt > 0); + return lprops; +} + +/** + * ubifs_fast_find_frdi_idx - try to find a freeable index LEB quickly. + * @c: the UBIFS file-system description object + * + * This function returns LEB properties for a freeable index LEB or %NULL if the + * function is unable to find a freeable index LEB quickly. + */ +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + + if (list_empty(&c->frdi_idx_list)) + return NULL; + + lprops = list_entry(c->frdi_idx_list.next, struct ubifs_lprops, list); + ubifs_assert(!(lprops->flags & LPROPS_TAKEN)); + ubifs_assert((lprops->flags & LPROPS_INDEX)); + ubifs_assert(lprops->free + lprops->dirty == c->leb_size); + return lprops; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_check_cats - check category heaps and lists. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int dbg_check_cats(struct ubifs_info *c) +{ + struct ubifs_lprops *lprops; + struct list_head *pos; + int i, cat; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return 0; + + list_for_each_entry(lprops, &c->empty_list, list) { + if (lprops->free != c->leb_size) { + ubifs_err("non-empty LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on empty list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + i = 0; + list_for_each_entry(lprops, &c->freeable_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on freeable list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + i += 1; + } + if (i != c->freeable_cnt) { + ubifs_err("freeable list count %d expected %d", i, + c->freeable_cnt); + return -EINVAL; + } + + i = 0; + list_for_each(pos, &c->idx_gc) + i += 1; + if (i != c->idx_gc_cnt) { + ubifs_err("idx_gc list count %d expected %d", i, + c->idx_gc_cnt); + return -EINVAL; + } + + list_for_each_entry(lprops, &c->frdi_idx_list, list) { + if (lprops->free + lprops->dirty != c->leb_size) { + ubifs_err("non-freeable LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB %d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + if (!(lprops->flags & LPROPS_INDEX)) { + ubifs_err("non-index LEB 
%d on frdi_idx list " + "(free %d dirty %d flags %d)", lprops->lnum, + lprops->free, lprops->dirty, lprops->flags); + return -EINVAL; + } + } + + for (cat = 1; cat <= LPROPS_HEAP_CNT; cat++) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + for (i = 0; i < heap->cnt; i++) { + lprops = heap->arr[i]; + if (!lprops) { + ubifs_err("null ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->hpos != i) { + ubifs_err("bad ptr in LPT heap cat %d", cat); + return -EINVAL; + } + if (lprops->flags & LPROPS_TAKEN) { + ubifs_err("taken LEB in LPT heap cat %d", cat); + return -EINVAL; + } + } + } + + return 0; +} + +void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, + int add_pos) +{ + int i = 0, j, err = 0; + + if (!(ubifs_chk_flags & (UBIFS_CHK_GEN | UBIFS_CHK_LPROPS))) + return; + + for (i = 0; i < heap->cnt; i++) { + struct ubifs_lprops *lprops = heap->arr[i]; + struct ubifs_lprops *lp; + + if (i != add_pos) + if ((lprops->flags & LPROPS_CAT_MASK) != cat) { + err = 1; + goto out; + } + if (lprops->hpos != i) { + err = 2; + goto out; + } + lp = ubifs_lpt_lookup(c, lprops->lnum); + if (IS_ERR(lp)) { + err = 3; + goto out; + } + if (lprops != lp) { + dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + (size_t)lprops, (size_t)lp, lprops->lnum, + lp->lnum); + err = 4; + goto out; + } + for (j = 0; j < i; j++) { + lp = heap->arr[j]; + if (lp == lprops) { + err = 5; + goto out; + } + if (lp->lnum == lprops->lnum) { + err = 6; + goto out; + } + } + } +out: + if (err) { + dbg_msg("failed cat %d hpos %d err %d", cat, i, err); + dbg_dump_stack(); + dbg_dump_heap(c, heap, cat); + } +} + +/** + * struct scan_check_data - data provided to scan callback function. + * @lst: LEB properties statistics + * @err: error code + */ +struct scan_check_data { + struct ubifs_lp_stats lst; + int err; +}; + +/** + * scan_check_cb - scan callback. 
+ * @c: the UBIFS file-system description object + * @lp: LEB properties to scan + * @in_tree: whether the LEB properties are in main memory + * @data: information passed to and from the caller of the scan + * + * This function returns a code that indicates whether the scan should continue + * (%LPT_SCAN_CONTINUE), whether the LEB properties should be added to the tree + * in main memory (%LPT_SCAN_ADD), or whether the scan should stop + * (%LPT_SCAN_STOP). + */ +static int scan_check_cb(struct ubifs_info *c, + const struct ubifs_lprops *lp, int in_tree, + struct scan_check_data *data) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_lp_stats *lst = &data->lst; + int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty; + + cat = lp->flags & LPROPS_CAT_MASK; + if (cat != LPROPS_UNCAT) { + cat = ubifs_categorize_lprops(c, lp); + if (cat != (lp->flags & LPROPS_CAT_MASK)) { + ubifs_err("bad LEB category %d expected %d", + (lp->flags & LPROPS_CAT_MASK), cat); + goto out; + } + } + + /* Check lp is on its category list (if it has one) */ + if (in_tree) { + struct list_head *list = NULL; + + switch (cat) { + case LPROPS_EMPTY: + list = &c->empty_list; + break; + case LPROPS_FREEABLE: + list = &c->freeable_list; + break; + case LPROPS_FRDI_IDX: + list = &c->frdi_idx_list; + break; + case LPROPS_UNCAT: + list = &c->uncat_list; + break; + } + if (list) { + struct ubifs_lprops *lprops; + int found = 0; + + list_for_each_entry(lprops, list, list) { + if (lprops == lp) { + found = 1; + break; + } + } + if (!found) { + ubifs_err("bad LPT list (category %d)", cat); + goto out; + } + } + } + + /* Check lp is on its category heap (if it has one) */ + if (in_tree && cat > 0 && cat <= LPROPS_HEAP_CNT) { + struct ubifs_lpt_heap *heap = &c->lpt_heap[cat - 1]; + + if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || + lp != heap->arr[lp->hpos]) { + ubifs_err("bad LPT heap (category %d)", cat); + goto out; + } + } + + sleb = ubifs_scan(c, lnum, 
0, c->dbg->buf); + if (IS_ERR(sleb)) { + /* + * After an unclean unmount, empty and freeable LEBs + * may contain garbage. + */ + if (lp->free == c->leb_size) { + ubifs_err("scan errors were in empty LEB " + "- continuing checking"); + lst->empty_lebs += 1; + lst->total_free += c->leb_size; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + + if (lp->free + lp->dirty == c->leb_size && + !(lp->flags & LPROPS_INDEX)) { + ubifs_err("scan errors were in freeable LEB " + "- continuing checking"); + lst->total_free += lp->free; + lst->total_dirty += lp->dirty; + lst->total_dark += calc_dark(c, c->leb_size); + return LPT_SCAN_CONTINUE; + } + data->err = PTR_ERR(sleb); + return LPT_SCAN_STOP; + } + + is_idx = -1; + list_for_each_entry(snod, &sleb->nodes, list) { + int found, level = 0; + + cond_resched(); + + if (is_idx == -1) + is_idx = (snod->type == UBIFS_IDX_NODE) ? 1 : 0; + + if (is_idx && snod->type != UBIFS_IDX_NODE) { + ubifs_err("indexing node in data LEB %d:%d", + lnum, snod->offs); + goto out_destroy; + } + + if (snod->type == UBIFS_IDX_NODE) { + struct ubifs_idx_node *idx = snod->node; + + key_read(c, ubifs_idx_key(c, idx), &snod->key); + level = le16_to_cpu(idx->level); + } + + found = ubifs_tnc_has_node(c, &snod->key, level, lnum, + snod->offs, is_idx); + if (found) { + if (found < 0) + goto out_destroy; + used += ALIGN(snod->len, 8); + } + } + + free = c->leb_size - sleb->endpt; + dirty = sleb->endpt - used; + + if (free > c->leb_size || free < 0 || dirty > c->leb_size || + dirty < 0) { + ubifs_err("bad calculated accounting for LEB %d: " + "free %d, dirty %d", lnum, free, dirty); + goto out_destroy; + } + + if (lp->free + lp->dirty == c->leb_size && + free + dirty == c->leb_size) + if ((is_idx && !(lp->flags & LPROPS_INDEX)) || + (!is_idx && free == c->leb_size) || + lp->free == c->leb_size) { + /* + * Empty or freeable LEBs could contain index + * nodes from an uncompleted commit due to an + * unclean unmount. 
Or they could be empty for + * the same reason. Or it may simply not have been + * unmapped. + */ + free = lp->free; + dirty = lp->dirty; + is_idx = 0; + } + + if (is_idx && lp->free + lp->dirty == free + dirty && + lnum != c->ihead_lnum) { + /* + * After an unclean unmount, an index LEB could have a different + * amount of free space than the value recorded by lprops. That + * is because the in-the-gaps method may use free space or + * create free space (as a side-effect of using ubi_leb_change + * and not writing the whole LEB). The incorrect free space + * value is not a problem because the index is only ever + * allocated empty LEBs, so there will never be an attempt to + * write to the free space at the end of an index LEB - except + * by the in-the-gaps method for which it is not a problem. + */ + free = lp->free; + dirty = lp->dirty; + } + + if (lp->free != free || lp->dirty != dirty) + goto out_print; + + if (is_idx && !(lp->flags & LPROPS_INDEX)) { + if (free == c->leb_size) + /* Free but not unmapped LEB, it's fine */ + is_idx = 0; + else { + ubifs_err("indexing node without indexing " + "flag"); + goto out_print; + } + } + + if (!is_idx && (lp->flags & LPROPS_INDEX)) { + ubifs_err("data node with indexing flag"); + goto out_print; + } + + if (free == c->leb_size) + lst->empty_lebs += 1; + + if (is_idx) + lst->idx_lebs += 1; + + if (!(lp->flags & LPROPS_INDEX)) + lst->total_used += c->leb_size - free - dirty; + lst->total_free += free; + lst->total_dirty += dirty; + + if (!(lp->flags & LPROPS_INDEX)) { + int spc = free + dirty; + + if (spc < c->dead_wm) + lst->total_dead += spc; + else + lst->total_dark += calc_dark(c, spc); + } + + ubifs_scan_destroy(sleb); + return LPT_SCAN_CONTINUE; + +out_print: + ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, " + "should be free %d, dirty %d", + lnum, lp->free, lp->dirty, lp->flags, free, dirty); + dbg_dump_leb(c, lnum); +out_destroy: + ubifs_scan_destroy(sleb); +out: + data->err = -EINVAL; + 
return LPT_SCAN_STOP;
+}
+
+/**
+ * dbg_check_lprops - check all LEB properties.
+ * @c: UBIFS file-system description object
+ *
+ * This function checks all LEB properties and makes sure they are all correct.
+ * It returns zero if everything is fine, %-EINVAL if there is an inconsistency
+ * and other negative error codes in case of other errors. This function is
+ * called while the file system is locked (because of commit start), so no
+ * additional locking is required. Note that locking the LPT mutex would cause
+ * a circular lock dependency with the TNC mutex.
+ *
+ * The check is skipped, and zero is returned, unless %UBIFS_CHK_LPROPS is set
+ * in 'ubifs_chk_flags'.
+ */
+int dbg_check_lprops(struct ubifs_info *c)
+{
+	int i, err;
+	struct scan_check_data data;
+	struct ubifs_lp_stats *lst = &data.lst;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+		return 0;
+
+	/*
+	 * As we are going to scan the media, the write buffers have to be
+	 * synchronized.
+	 */
+	for (i = 0; i < c->jhead_cnt; i++) {
+		err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+		if (err)
+			return err;
+	}
+
+	/* The scan callback accumulates its totals into 'data.lst' */
+	memset(lst, 0, sizeof(struct ubifs_lp_stats));
+
+	data.err = 0;
+	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
+				    (ubifs_lpt_scan_callback)scan_check_cb,
+				    &data);
+	/* %-ENOSPC from the scan is not treated as a failure here */
+	if (err && err != -ENOSPC)
+		goto out;
+	/* An error recorded by the callback takes over as the result */
+	if (data.err) {
+		err = data.err;
+		goto out;
+	}
+
+	/* Compare the totals gathered by the scan with those kept in 'c->lst' */
+	if (lst->empty_lebs != c->lst.empty_lebs ||
+	    lst->idx_lebs != c->lst.idx_lebs ||
+	    lst->total_free != c->lst.total_free ||
+	    lst->total_dirty != c->lst.total_dirty ||
+	    lst->total_used != c->lst.total_used) {
+		ubifs_err("bad overall accounting");
+		ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
+			  "total_free %lld, total_dirty %lld, total_used %lld",
+			  lst->empty_lebs, lst->idx_lebs, lst->total_free,
+			  lst->total_dirty, lst->total_used);
+		ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
+			  "total_free %lld, total_dirty %lld, total_used %lld",
+			  c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
+			  c->lst.total_dirty, c->lst.total_used);
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Dead and dark space are checked, and reported, separately */
+	if (lst->total_dead != c->lst.total_dead ||
+	    lst->total_dark != c->lst.total_dark) {
+		ubifs_err("bad dead/dark space accounting");
+		ubifs_err("calculated: total_dead %lld, total_dark %lld",
+			  lst->total_dead, lst->total_dark);
+		ubifs_err("read from lprops: total_dead %lld, total_dark %lld",
+			  c->lst.total_dead, c->lst.total_dark);
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = dbg_check_cats(c);
+out:
+	return err;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -Nurd linux-2.6.24/fs/ubifs/lpt.c ubifs-v2.6.24/fs/ubifs/lpt.c
--- linux-2.6.24/fs/ubifs/lpt.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/lpt.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,2271 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements the LEB properties tree (LPT) area. The LPT area
+ * contains the LEB properties tree, a table of LPT area eraseblocks (ltab), and
+ * (for the "big" model) a table of saved LEB numbers (lsave). The LPT area sits
+ * between the log and the orphan area.
+ *
+ * The LPT area is like a miniature self-contained file system. It is required
+ * that it never runs out of space, is fast to access and update, and scales
+ * logarithmically.
The LEB properties tree is implemented as a wandering tree
+ * much like the TNC, and the LPT area has its own garbage collection.
+ *
+ * The LPT has two slightly different forms called the "small model" and the
+ * "big model". The small model is used when the entire LEB properties table
+ * can be written into a single eraseblock. In that case, garbage collection
+ * consists of just writing the whole table, which therefore makes all other
+ * eraseblocks reusable. In the case of the big model, dirty eraseblocks are
+ * selected for garbage collection, which consists of marking the clean nodes in
+ * that LEB as dirty, and then only the dirty nodes are written out. Also, in
+ * the case of the big model, a table of LEB numbers is saved so that the entire
+ * LPT does not need to be scanned looking for empty eraseblocks when UBIFS is
+ * first mounted.
+ */
+
+#include "ubifs.h"
+#include <linux/crc16.h>
+
+/**
+ * do_calc_lpt_geom - calculate sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * Calculate the sizes of LPT bit fields, nodes, and tree, based on the
+ * properties of the flash and whether LPT is "big" (c->big_lpt).
+ */
+static void do_calc_lpt_geom(struct ubifs_info *c)
+{
+	int lvl, cnt, nbits, hdr_bits, waste_per_leb, max_pnodes;
+	long long rem, waste;
+
+	/*
+	 * The tree height is sized for the number of main area LEBs there
+	 * would be if the LEB count were 'c->max_leb_cnt'.
+	 */
+	cnt = c->main_lebs + c->max_leb_cnt - c->leb_cnt;
+	max_pnodes = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+
+	c->lpt_hght = 1;
+	for (cnt = UBIFS_LPT_FANOUT; cnt < max_pnodes;
+	     cnt <<= UBIFS_LPT_FANOUT_SHIFT)
+		c->lpt_hght += 1;
+
+	/* Node counts: the pnodes first, then each nnode level above them */
+	c->pnode_cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT);
+
+	cnt = DIV_ROUND_UP(c->pnode_cnt, UBIFS_LPT_FANOUT);
+	c->nnode_cnt = cnt;
+	for (lvl = 1; lvl < c->lpt_hght; lvl++) {
+		cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+		c->nnode_cnt += cnt;
+	}
+
+	/* Widths of the packed bit fields */
+	c->space_bits = fls(c->leb_size) - 3;
+	c->lpt_spc_bits = fls(c->leb_size);
+	c->lpt_offs_bits = fls(c->leb_size - 1);
+	c->lpt_lnum_bits = fls(c->lpt_lebs);
+	c->lnum_bits = fls(c->max_leb_cnt - 1);
+
+	cnt = DIV_ROUND_UP(c->max_leb_cnt, UBIFS_LPT_FANOUT);
+	c->pcnt_bits = fls(cnt - 1);
+
+	/* Node sizes: count the bits, then round up to whole bytes */
+	hdr_bits = UBIFS_LPT_CRC_BITS + UBIFS_LPT_TYPE_BITS;
+
+	nbits = hdr_bits + (c->big_lpt ? c->pcnt_bits : 0) +
+		(c->space_bits * 2 + 1) * UBIFS_LPT_FANOUT;
+	c->pnode_sz = (nbits + 7) / 8;
+
+	nbits = hdr_bits + (c->big_lpt ? c->pcnt_bits : 0) +
+		(c->lpt_lnum_bits + c->lpt_offs_bits) * UBIFS_LPT_FANOUT;
+	c->nnode_sz = (nbits + 7) / 8;
+
+	nbits = hdr_bits + c->lpt_lebs * c->lpt_spc_bits * 2;
+	c->ltab_sz = (nbits + 7) / 8;
+
+	nbits = hdr_bits + c->lnum_bits * c->lsave_cnt;
+	c->lsave_sz = (nbits + 7) / 8;
+
+	/* Minimum LPT size: every node, ltab, and lsave for the big model */
+	c->lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
+	c->lpt_sz += (long long)c->nnode_cnt * c->nnode_sz;
+	c->lpt_sz += c->ltab_sz;
+	if (c->big_lpt)
+		c->lpt_sz += c->lsave_sz;
+
+	/*
+	 * Add wastage: one largest-node size for every LEB the LPT spans,
+	 * plus the padding that aligns the last, partly filled LEB to the
+	 * minimum I/O unit size.
+	 */
+	waste_per_leb = max_t(int, c->pnode_sz, c->nnode_sz);
+	waste = waste_per_leb;
+	rem = c->lpt_sz + waste_per_leb;
+	while (rem > c->leb_size) {
+		rem = rem + waste_per_leb - c->leb_size;
+		waste += waste_per_leb;
+	}
+	waste += ALIGN(rem, c->min_io_size) - rem;
+	c->lpt_sz += waste;
+}
+
+/**
+ * ubifs_calc_lpt_geom - calculate and check sizes for the LPT area.
+ * @c: the UBIFS file-system description object
+ *
+ * The geometry is calculated by 'do_calc_lpt_geom()' and then checked against
+ * the number of LPT LEBs and the LEB size. This function returns %0 on success
+ * and %-EINVAL if the LPT area is too small or ltab does not fit in one LEB.
+ */
+int ubifs_calc_lpt_geom(struct ubifs_info *c)
+{
+	long long need;
+	int min_lebs;
+
+	do_calc_lpt_geom(c);
+
+	/* The LPT area has to hold at least twice the calculated LPT size */
+	need = c->lpt_sz * 2;
+	min_lebs = div_u64(need + c->leb_size - 1, c->leb_size);
+	if (min_lebs > c->lpt_lebs) {
+		ubifs_err("too few LPT LEBs");
+		return -EINVAL;
+	}
+
+	/* ltab is a single node, so it has to fit in a single LEB */
+	if (c->ltab_sz > c->leb_size) {
+		ubifs_err("LPT ltab too big");
+		return -EINVAL;
+	}
+
+	c->check_lpt_free = c->big_lpt;
+	return 0;
+}
+
+/**
+ * calc_dflt_lpt_geom - calculate default LPT geometry.
+ * @c: the UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @big_lpt: whether the LPT area is "big" is returned here
+ *
+ * The size of the LPT area depends on parameters that themselves are dependent
+ * on the size of the LPT area. This function successively recalculates the LPT
+ * area geometry until the parameters and resultant geometry are consistent,
+ * giving up after 64 attempts.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
+			      int *big_lpt)
+{
+	int attempt, min_lebs;
+	long long need;
+
+	/* First guess: the minimum number of LPT LEBs and the small model */
+	c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
+	c->main_lebs = *main_lebs - c->lpt_lebs;
+	if (c->main_lebs <= 0)
+		return -EINVAL;
+	c->big_lpt = 0;
+
+	do_calc_lpt_geom(c);
+
+	/* If the LPT is larger than one LEB, switch to the big model */
+	if (c->lpt_sz > c->leb_size) {
+		c->big_lpt = 1;
+		do_calc_lpt_geom(c);
+	}
+
+	/* Grow the LPT area until it holds four times the LPT size */
+	for (attempt = 0; attempt < 64; attempt++) {
+		need = c->lpt_sz * 4;
+		min_lebs = div_u64(need + c->leb_size - 1, c->leb_size);
+
+		if (min_lebs <= c->lpt_lebs) {
+			/* Enough LPT LEBs, so the geometry is settled */
+			if (c->ltab_sz > c->leb_size) {
+				ubifs_err("LPT ltab too big");
+				return -EINVAL;
+			}
+			*main_lebs = c->main_lebs;
+			*big_lpt = c->big_lpt;
+			return 0;
+		}
+
+		/* Take the extra LEBs from the main area and recalculate */
+		c->lpt_lebs = min_lebs;
+		c->main_lebs = *main_lebs - c->lpt_lebs;
+		if (c->main_lebs <= 0)
+			return -EINVAL;
+		do_calc_lpt_geom(c);
+	}
+
+	return -EINVAL;
+}
+
+/**
+ * pack_bits - pack bit fields end-to-end.
+ * @addr: address at which to pack (passed and next address returned)
+ * @pos: bit position at which to pack (passed and next position returned)
+ * @val: value to pack
+ * @nrbits: number of bits of value to pack (1-32)
+ *
+ * The value is written least significant bits first, starting at bit @pos of
+ * the byte at @addr. When @pos is not zero the first byte is OR-ed into, so
+ * the bits already packed there are kept.
+ */
+static void pack_bits(uint8_t **addr, int *pos, uint32_t val, int nrbits)
+{
+	uint8_t *p = *addr;
+	int b = *pos;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	/*
+	 * Test for the 32-bit case first: shifting a 32-bit value by 32 is
+	 * undefined, so the shift must not be evaluated when @nrbits is 32.
+	 */
+	ubifs_assert(nrbits == 32 || (val >> nrbits) == 0);
+	if (b) {
+		/* Finish the partly filled byte, then emit whole bytes */
+		*p |= ((uint8_t)val) << b;
+		nrbits += b;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= (8 - b));
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24) {
+					*++p = (uint8_t)(val >>= 8);
+					if (nrbits > 32)
+						*++p = (uint8_t)(val >>= 8);
+				}
+			}
+		}
+	} else {
+		*p = (uint8_t)val;
+		if (nrbits > 8) {
+			*++p = (uint8_t)(val >>= 8);
+			if (nrbits > 16) {
+				*++p = (uint8_t)(val >>= 8);
+				if (nrbits > 24)
+					*++p = (uint8_t)(val >>= 8);
+			}
+		}
+	}
+	/* If the last byte written is full, the next field starts a new byte */
+	b = nrbits & 7;
+	if (b == 0)
+		p++;
+	*addr = p;
+	*pos = b;
+}
+
+/**
+ * ubifs_unpack_bits - unpack bit fields.
+ * @addr: address at which to unpack (passed and next address returned)
+ * @pos: bit position at which to unpack (passed and next position returned)
+ * @nrbits: number of bits of value to unpack (1-32)
+ *
+ * This function returns the value unpacked.
+ */
+uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits)
+{
+	const int k = 32 - nrbits;
+	uint8_t *p = *addr;
+	int b = *pos;
+	/*
+	 * Start from zero: when the field begins mid-byte and also ends in that
+	 * same byte, none of the cases below assigns 'val' before it is shifted
+	 * and OR-ed into.
+	 */
+	uint32_t val = 0;
+	const int bytes = (nrbits + b + 7) >> 3;
+
+	ubifs_assert(nrbits > 0);
+	ubifs_assert(nrbits <= 32);
+	ubifs_assert(*pos >= 0);
+	ubifs_assert(*pos < 8);
+	if (b) {
+		/* Gather the bytes after the first, then add the first's top bits */
+		switch (bytes) {
+		case 2:
+			val = p[1];
+			break;
+		case 3:
+			val = p[1] | ((uint32_t)p[2] << 8);
+			break;
+		case 4:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16);
+			break;
+		case 5:
+			val = p[1] | ((uint32_t)p[2] << 8) |
+				     ((uint32_t)p[3] << 16) |
+				     ((uint32_t)p[4] << 24);
+			break;
+		}
+		val <<= (8 - b);
+		val |= *p >> b;
+		nrbits += b;
+	} else {
+		switch (bytes) {
+		case 1:
+			val = p[0];
+			break;
+		case 2:
+			val = p[0] | ((uint32_t)p[1] << 8);
+			break;
+		case 3:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16);
+			break;
+		case 4:
+			val = p[0] | ((uint32_t)p[1] << 8) |
+				     ((uint32_t)p[2] << 16) |
+				     ((uint32_t)p[3] << 24);
+			break;
+		}
+	}
+	/* Drop everything above the requested number of bits */
+	val <<= k;
+	val >>= k;
+	b = nrbits & 7;
+	p += nrbits >> 3;
+	*addr = p;
+	*pos = b;
+	/*
+	 * 'nrbits' may be anything up to 39 here. Test the guard first so that
+	 * 'val' is never shifted by 32 or more, which is undefined.
+	 */
+	ubifs_assert(nrbits - b == 32 || (val >> nrbits) == 0);
+	return val;
+}
+
+/**
+ * ubifs_pack_pnode - pack all the bit fields of a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @pnode: pnode to pack
+ *
+ * The packed node holds the CRC, the node type, the pnode number (big model
+ * only) and then, for each LEB, its free space, its dirty space and a one-bit
+ * index flag. Free and dirty space are stored divided by 8. The CRC covers
+ * everything after the CRC field, up to 'c->pnode_sz' bytes.
+ */
+void ubifs_pack_pnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_pnode *pnode)
+{
+	void *body = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *addr = body;
+	int pos = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_PNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&addr, &pos, pnode->num, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		const struct ubifs_lprops *lp = &pnode->lprops[idx];
+
+		pack_bits(&addr, &pos, lp->free >> 3, c->space_bits);
+		pack_bits(&addr, &pos, lp->dirty >> 3, c->space_bits);
+		pack_bits(&addr, &pos, (lp->flags & LPROPS_INDEX) ? 1 : 0, 1);
+	}
+
+	/* The CRC sits in front of the data it covers, so it is packed last */
+	crc = crc16(-1, body, c->pnode_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_nnode - pack all the bit fields of a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @nnode: nnode to pack
+ *
+ * The packed node holds the CRC, the node type, the nnode number (big model
+ * only) and then, for each branch, the LEB number relative to 'c->lpt_first'
+ * and the offset. The CRC covers everything after the CRC field, up to
+ * 'c->nnode_sz' bytes.
+ */
+void ubifs_pack_nnode(struct ubifs_info *c, void *buf,
+		      struct ubifs_nnode *nnode)
+{
+	void *body = buf + UBIFS_LPT_CRC_BYTES;
+	uint8_t *addr = body;
+	int pos = 0, idx;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_NNODE, UBIFS_LPT_TYPE_BITS);
+	if (c->big_lpt)
+		pack_bits(&addr, &pos, nnode->num, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		int lnum = nnode->nbranch[idx].lnum;
+
+		/* LEB number zero is stored as one past the last LPT LEB */
+		lnum = lnum ? lnum : c->lpt_last + 1;
+		pack_bits(&addr, &pos, lnum - c->lpt_first, c->lpt_lnum_bits);
+		pack_bits(&addr, &pos, nnode->nbranch[idx].offs,
+			  c->lpt_offs_bits);
+	}
+
+	/* The CRC sits in front of the data it covers, so it is packed last */
+	crc = crc16(-1, body, c->nnode_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_ltab - pack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @ltab: LPT's own lprops table to pack
+ *
+ * The packed table holds the CRC, the node type and then the free and dirty
+ * space of each of the 'c->lpt_lebs' LPT LEBs. The CRC covers everything after
+ * the CRC field, up to 'c->ltab_sz' bytes.
+ */
+void ubifs_pack_ltab(struct ubifs_info *c, void *buf,
+		     struct ubifs_lpt_lprops *ltab)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_LTAB, UBIFS_LPT_TYPE_BITS);
+	for (i = 0; i < c->lpt_lebs; i++) {
+		pack_bits(&addr, &pos, ltab[i].free, c->lpt_spc_bits);
+		pack_bits(&addr, &pos, ltab[i].dirty, c->lpt_spc_bits);
+	}
+	/* The CRC sits in front of the data it covers, so it is packed last */
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->ltab_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_pack_lsave - pack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer into which to pack
+ * @lsave: LPT's save table to pack
+ *
+ * The packed table holds the CRC, the node type and then 'c->lsave_cnt' LEB
+ * numbers. The CRC covers everything after the CRC field, up to 'c->lsave_sz'
+ * bytes.
+ */
+void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int i, pos = 0;
+	uint16_t crc;
+
+	pack_bits(&addr, &pos, UBIFS_LPT_LSAVE, UBIFS_LPT_TYPE_BITS);
+	for (i = 0; i < c->lsave_cnt; i++)
+		pack_bits(&addr, &pos, lsave[i], c->lnum_bits);
+	/* The CRC sits in front of the data it covers, so it is packed last */
+	crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+		    c->lsave_sz - UBIFS_LPT_CRC_BYTES);
+	addr = buf;
+	pos = 0;
+	pack_bits(&addr, &pos, crc, UBIFS_LPT_CRC_BITS);
+}
+
+/**
+ * ubifs_add_lpt_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to which to add dirty space
+ * @dirty: amount of dirty space to add
+ *
+ * Nothing is done if either @dirty or @lnum is zero.
+ */
+void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty)
+{
+	if (!dirty || !lnum)
+		return;
+	/*
+	 * Check the range before anything indexes ltab, so that the assertion
+	 * comes ahead of the table access made for the debugging message.
+	 */
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	dbg_lp("LEB %d add %d to %d",
+	       lnum, dirty, c->ltab[lnum - c->lpt_first].dirty);
+	c->ltab[lnum - c->lpt_first].dirty += dirty;
+}
+
+/**
+ * set_ltab - set LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space
+ * @dirty: amount of dirty space
+ *
+ * The entry for @lnum in 'c->ltab' is overwritten with @free and @dirty.
+ */
+static void set_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	/*
+	 * Check the range before anything indexes ltab, so that the assertion
+	 * comes ahead of the table accesses made for the debugging message.
+	 */
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	dbg_lp("LEB %d free %d dirty %d to %d %d",
+	       lnum, c->ltab[lnum - c->lpt_first].free,
+	       c->ltab[lnum - c->lpt_first].dirty, free, dirty);
+	c->ltab[lnum - c->lpt_first].free = free;
+	c->ltab[lnum - c->lpt_first].dirty = dirty;
+}
+
+/**
+ * ubifs_add_nnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode for which to add dirt
+ *
+ * The size of a nnode is added as dirty space to the LEB recorded for @nnode:
+ * the parent's branch if there is a parent, otherwise 'c->lpt_lnum'.
+ */
+void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode)
+{
+	struct ubifs_nnode *np = nnode->parent;
+
+	if (np)
+		ubifs_add_lpt_dirt(c, np->nbranch[nnode->iip].lnum,
+				   c->nnode_sz);
+	else {
+		ubifs_add_lpt_dirt(c, c->lpt_lnum, c->nnode_sz);
+		/* Account for ltab as well, once, when %LTAB_DIRTY gets set */
+		if (!(c->lpt_drty_flgs & LTAB_DIRTY)) {
+			c->lpt_drty_flgs |= LTAB_DIRTY;
+			ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz);
+		}
+	}
+}
+
+/**
+ * add_pnode_dirt - add dirty space to LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode for which to add dirt
+ *
+ * The size of a pnode is added as dirty space to the LEB recorded for @pnode
+ * in its parent's branch. @pnode->parent is dereferenced without a check.
+ */
+static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum,
+			   c->pnode_sz);
+}
+
+/**
+ * calc_nnode_num - calculate nnode number.
+ * @row: the row in the tree (root is zero)
+ * @col: the column in the row (leftmost is zero)
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number for the nnode at @row
+ * and @col.
+ */
+static int calc_nnode_num(int row, int col)
+{
+	int num = 1;
+
+	/* Each row appends the next lowest group of column bits to the number */
+	while (row-- != 0) {
+		num = (num << UBIFS_LPT_FANOUT_SHIFT) |
+		      (col & (UBIFS_LPT_FANOUT - 1));
+		col >>= UBIFS_LPT_FANOUT_SHIFT;
+	}
+	return num;
+}
+
+/**
+ * calc_nnode_num_from_parent - calculate nnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (%NULL for the root nnode)
+ * @iip: index in parent
+ *
+ * The nnode number is a number that uniquely identifies a nnode and can be used
+ * easily to traverse the tree from the root to that nnode.
+ *
+ * This function calculates and returns the nnode number based on the parent's
+ * nnode number and the index in parent. The number %1 is returned when there is
+ * no parent.
+ */
+static int calc_nnode_num_from_parent(const struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int shift;
+
+	if (!parent)
+		return 1;
+
+	shift = (c->lpt_hght - parent->level) * UBIFS_LPT_FANOUT_SHIFT;
+	return (parent->num ^ (1 << shift)) |
+	       ((UBIFS_LPT_FANOUT + iip) << shift);
+}
+
+/**
+ * calc_pnode_num_from_parent - calculate pnode number.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * The pnode number is a number that uniquely identifies a pnode and can be used
+ * easily to traverse the tree from the root to that pnode.
+ *
+ * This function calculates and returns the pnode number based on the parent's
+ * nnode number and the index in parent.
+ */
+static int calc_pnode_num_from_parent(const struct ubifs_info *c,
+				      struct ubifs_nnode *parent, int iip)
+{
+	int left, path = parent->num, num = 0;
+
+	/* Move one group of bits per level from the parent's number */
+	for (left = c->lpt_hght - 1; left > 0; left--) {
+		num = (num << UBIFS_LPT_FANOUT_SHIFT) |
+		      (path & (UBIFS_LPT_FANOUT - 1));
+		path >>= UBIFS_LPT_FANOUT_SHIFT;
+	}
+	return (num << UBIFS_LPT_FANOUT_SHIFT) | iip;
+}
+
+/**
+ * ubifs_create_dflt_lpt - create default LPT.
+ * @c: UBIFS file-system description object
+ * @main_lebs: number of main area LEBs is passed and returned here
+ * @lpt_first: LEB number of first LPT LEB
+ * @lpt_lebs: number of LEBs for LPT is returned here
+ * @big_lpt: whether the big LPT model is used is returned here (it is set by
+ *           'calc_dflt_lpt_geom()' before it is read)
+ *
+ * The nodes are packed into a LEB-sized buffer in this order: all pnodes, the
+ * nnodes one level at a time ending with the root, lsave (big model only) and
+ * finally ltab. Whenever the next node does not fit, the buffer is written out
+ * and the next LPT LEB is started.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first,
+			  int *lpt_lebs, int *big_lpt)
+{
+	int lnum, err = 0, node_sz, iopos, i, j, cnt, len, alen, row;
+	int blnum, boffs, bsz, bcnt;
+	struct ubifs_pnode *pnode = NULL;
+	struct ubifs_nnode *nnode = NULL;
+	void *buf = NULL, *p;
+	struct ubifs_lpt_lprops *ltab = NULL;
+	int *lsave = NULL;
+
+	err = calc_dflt_lpt_geom(c, main_lebs, big_lpt);
+	if (err)
+		return err;
+	*lpt_lebs = c->lpt_lebs;
+
+	/* Needed by 'ubifs_pack_nnode()' and 'set_ltab()' */
+	c->lpt_first = lpt_first;
+	/* Needed by 'set_ltab()' */
+	c->lpt_last = lpt_first + c->lpt_lebs - 1;
+	/* Needed by 'ubifs_pack_lsave()' */
+	c->main_first = c->leb_cnt - *main_lebs;
+
+	/* Allocate everything first, then check all the results at once */
+	lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_KERNEL);
+	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_KERNEL);
+	nnode = kzalloc(sizeof(struct ubifs_nnode), GFP_KERNEL);
+	buf = vmalloc(c->leb_size);
+	ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!pnode || !nnode || !buf || !ltab || !lsave) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	ubifs_assert(!c->ltab);
+	c->ltab = ltab; /* Needed by set_ltab */
+
+	/* Initialize LPT's own lprops */
+	for (i = 0; i < c->lpt_lebs; i++) {
+		ltab[i].free = c->leb_size;
+		ltab[i].dirty = 0;
+		ltab[i].tgc = 0;
+		ltab[i].cmt = 0;
+	}
+
+	lnum = lpt_first;
+	p = buf;
+	/* Number of leaf nodes (pnodes) */
+	cnt = c->pnode_cnt;
+
+	/*
+	 * The first pnode contains the LEB properties for the LEBs that contain
+	 * the root inode node and the root index node of the index tree.
+	 */
+	node_sz = ALIGN(ubifs_idx_node_sz(c, 1), 8);
+	iopos = ALIGN(node_sz, c->min_io_size);
+	pnode->lprops[0].free = c->leb_size - iopos;
+	pnode->lprops[0].dirty = iopos - node_sz;
+	pnode->lprops[0].flags = LPROPS_INDEX;
+
+	node_sz = UBIFS_INO_NODE_SZ;
+	iopos = ALIGN(node_sz, c->min_io_size);
+	pnode->lprops[1].free = c->leb_size - iopos;
+	pnode->lprops[1].dirty = iopos - node_sz;
+
+	/* The remaining LEBs of the first pnode are entirely free */
+	for (i = 2; i < UBIFS_LPT_FANOUT; i++)
+		pnode->lprops[i].free = c->leb_size;
+
+	/* Add first pnode */
+	ubifs_pack_pnode(c, p, pnode);
+	p += c->pnode_sz;
+	len = c->pnode_sz;
+	pnode->num += 1;
+
+	/* Reset pnode values for remaining pnodes */
+	pnode->lprops[0].free = c->leb_size;
+	pnode->lprops[0].dirty = 0;
+	pnode->lprops[0].flags = 0;
+
+	pnode->lprops[1].free = c->leb_size;
+	pnode->lprops[1].dirty = 0;
+
+	/*
+	 * To calculate the internal node branches, we keep information about
+	 * the level below.
+	 */
+	blnum = lnum; /* LEB number of level below */
+	boffs = 0; /* Offset of level below */
+	bcnt = cnt; /* Number of nodes in level below */
+	bsz = c->pnode_sz; /* Size of nodes in level below */
+
+	/* Add all remaining pnodes */
+	for (i = 1; i < cnt; i++) {
+		if (len + c->pnode_sz > c->leb_size) {
+			/*
+			 * The next pnode does not fit: pad the buffer with 0xff
+			 * up to the aligned length, record the padding as dirty
+			 * space, write the LEB and start the next one.
+			 */
+			alen = ALIGN(len, c->min_io_size);
+			set_ltab(c, lnum, c->leb_size - alen, alen - len);
+			memset(p, 0xff, alen - len);
+			err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+					     UBI_SHORTTERM);
+			if (err)
+				goto out;
+			p = buf;
+			len = 0;
+		}
+		ubifs_pack_pnode(c, p, pnode);
+		p += c->pnode_sz;
+		len += c->pnode_sz;
+		/*
+		 * pnodes are simply numbered left to right starting at zero,
+		 * which means the pnode number can be used easily to traverse
+		 * down the tree to the corresponding pnode.
+		 */
+		pnode->num += 1;
+	}
+
+	/* Work out the row of the nnodes that sit just above the pnodes */
+	row = 0;
+	for (i = UBIFS_LPT_FANOUT; cnt > i; i <<= UBIFS_LPT_FANOUT_SHIFT)
+		row += 1;
+	/* Add all nnodes, one level at a time */
+	while (1) {
+		/* Number of internal nodes (nnodes) at next level */
+		cnt = DIV_ROUND_UP(cnt, UBIFS_LPT_FANOUT);
+		for (i = 0; i < cnt; i++) {
+			/* Same write-and-start-a-new-LEB step as for pnodes */
+			if (len + c->nnode_sz > c->leb_size) {
+				alen = ALIGN(len, c->min_io_size);
+				set_ltab(c, lnum, c->leb_size - alen,
+					    alen - len);
+				memset(p, 0xff, alen - len);
+				err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+						     UBI_SHORTTERM);
+				if (err)
+					goto out;
+				p = buf;
+				len = 0;
+			}
+			/* Only 1 nnode at this level, so it is the root */
+			if (cnt == 1) {
+				c->lpt_lnum = lnum;
+				c->lpt_offs = len;
+			}
+			/* Set branches to the level below */
+			for (j = 0; j < UBIFS_LPT_FANOUT; j++) {
+				if (bcnt) {
+					/* Follow the level below into its next LEB */
+					if (boffs + bsz > c->leb_size) {
+						blnum += 1;
+						boffs = 0;
+					}
+					nnode->nbranch[j].lnum = blnum;
+					nnode->nbranch[j].offs = boffs;
+					boffs += bsz;
+					bcnt--;
+				} else {
+					/* No more nodes below: leave the branch empty */
+					nnode->nbranch[j].lnum = 0;
+					nnode->nbranch[j].offs = 0;
+				}
+			}
+			nnode->num = calc_nnode_num(row, i);
+			ubifs_pack_nnode(c, p, nnode);
+			p += c->nnode_sz;
+			len += c->nnode_sz;
+		}
+		/* Only 1 nnode at this level, so it is the root */
+		if (cnt == 1)
+			break;
+		/* Update the information about the level below */
+		bcnt = cnt;
+		bsz = c->nnode_sz;
+		row -= 1;
+	}
+
+	if (*big_lpt) {
+		/* Need to add LPT's save table */
+		if (len + c->lsave_sz > c->leb_size) {
+			alen = ALIGN(len, c->min_io_size);
+			set_ltab(c, lnum, c->leb_size - alen, alen - len);
+			memset(p, 0xff, alen - len);
+			err = ubi_leb_change(c->ubi, lnum++, buf, alen,
+					     UBI_SHORTTERM);
+			if (err)
+				goto out;
+			p = buf;
+			len = 0;
+		}
+
+		c->lsave_lnum = lnum;
+		c->lsave_offs = len;
+
+		/*
+		 * Fill lsave with the first main area LEB numbers. If there
+		 * are more slots than main area LEBs, the rest repeat the
+		 * first main area LEB.
+		 */
+		for (i = 0; i < c->lsave_cnt && i < *main_lebs; i++)
+			lsave[i] = c->main_first + i;
+		for (; i < c->lsave_cnt; i++)
+			lsave[i] = c->main_first;
+
+		ubifs_pack_lsave(c, p, lsave);
+		p += c->lsave_sz;
+		len += c->lsave_sz;
+	}
+
+	/* Need to add LPT's own LEB properties table */
+	if (len + c->ltab_sz > c->leb_size) {
+		alen = ALIGN(len, c->min_io_size);
+		set_ltab(c, lnum, c->leb_size - alen, alen - len);
+		memset(p, 0xff, alen - len);
+		err = ubi_leb_change(c->ubi, lnum++, buf, alen, UBI_SHORTTERM);
+		if (err)
+			goto out;
+		p = buf;
+		len = 0;
+	}
+
+	c->ltab_lnum = lnum;
+	c->ltab_offs = len;
+
+	/* Update ltab before packing it */
+	len += c->ltab_sz;
+	alen = ALIGN(len, c->min_io_size);
+	set_ltab(c, lnum, c->leb_size - alen, alen - len);
+
+	ubifs_pack_ltab(c, p, ltab);
+	p += c->ltab_sz;
+
+	/* Write remaining buffer */
+	memset(p, 0xff, alen - len);
+	err = ubi_leb_change(c->ubi, lnum, buf, alen, UBI_SHORTTERM);
+	if (err)
+		goto out;
+
+	/* The LPT head is left just after the aligned end of the last write */
+	c->nhead_lnum = lnum;
+	c->nhead_offs = ALIGN(len, c->min_io_size);
+
+	dbg_lp("space_bits %d", c->space_bits);
+	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+	dbg_lp("pcnt_bits %d", c->pcnt_bits);
+	dbg_lp("lnum_bits %d", c->lnum_bits);
+	dbg_lp("pnode_sz %d", c->pnode_sz);
+	dbg_lp("nnode_sz %d", c->nnode_sz);
+	dbg_lp("ltab_sz %d", c->ltab_sz);
+	dbg_lp("lsave_sz %d", c->lsave_sz);
+	dbg_lp("lsave_cnt %d", c->lsave_cnt);
+	dbg_lp("lpt_hght %d", c->lpt_hght);
+	dbg_lp("big_lpt %d", c->big_lpt);
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+out:
+	/* Reached on success too: 'c->ltab' is cleared and everything freed */
+	c->ltab = NULL;
+	kfree(lsave);
+	vfree(ltab);
+	vfree(buf);
+	kfree(nnode);
+	kfree(pnode);
+	return err;
+}
+
+/**
+ * update_cats - add LEB properties of a pnode to LEB category lists and heaps.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * When a pnode is loaded into memory, the LEB properties it contains are added,
+ * by this function, to the LEB category lists and heaps.
+ */
+static void update_cats(struct ubifs_info *c, struct ubifs_pnode *pnode)
+{
+	struct ubifs_lprops *lp = pnode->lprops;
+	struct ubifs_lprops *end = lp + UBIFS_LPT_FANOUT;
+
+	/* Stop at the first entry that has no LEB number */
+	for (; lp < end && lp->lnum; lp++)
+		ubifs_add_to_cat(c, lp, lp->flags & LPROPS_CAT_MASK);
+}
+
+/**
+ * replace_cats - replace the LEB properties of a pnode in category lists.
+ * @c: UBIFS file-system description object
+ * @old_pnode: pnode copied
+ * @new_pnode: pnode copy
+ *
+ * During commit it is sometimes necessary to copy a pnode
+ * (see dirty_cow_pnode). When that happens, references in
+ * category lists and heaps must be replaced. This function does that.
+ */
+static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode,
+			 struct ubifs_pnode *new_pnode)
+{
+	int idx;
+
+	/* Stop at the first entry of the copy that has no LEB number */
+	for (idx = 0;
+	     idx < UBIFS_LPT_FANOUT && new_pnode->lprops[idx].lnum;
+	     idx++)
+		ubifs_replace_cat(c, &old_pnode->lprops[idx],
+				  &new_pnode->lprops[idx]);
+}
+
+/**
+ * check_lpt_crc - check LPT node crc is correct.
+ * @buf: buffer containing node
+ * @len: length of node
+ *
+ * The CRC stored at the start of @buf is compared with the one calculated over
+ * the rest of the node. This function returns %0 if they match and %-EINVAL
+ * otherwise.
+ */
+static int check_lpt_crc(void *buf, int len)
+{
+	uint8_t *addr = buf;
+	int pos = 0;
+	uint16_t stored, calculated;
+
+	stored = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS);
+	calculated = crc16(-1, buf + UBIFS_LPT_CRC_BYTES,
+			   len - UBIFS_LPT_CRC_BYTES);
+	if (stored == calculated)
+		return 0;
+
+	ubifs_err("invalid crc in LPT node: crc %hx calc %hx", stored,
+		  calculated);
+	dbg_dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * check_lpt_type - check LPT node type is correct.
+ * @addr: address of type bit field is passed and returned updated here
+ * @pos: position of type bit field is passed and returned updated here
+ * @type: expected type
+ *
+ * This function returns %0 if the type read matches @type and %-EINVAL
+ * otherwise.
+ */
+static int check_lpt_type(uint8_t **addr, int *pos, int type)
+{
+	int found = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS);
+
+	if (found == type)
+		return 0;
+
+	ubifs_err("invalid type (%d) in LPT node type %d", found,
+		  type);
+	dbg_dump_stack();
+	return -EINVAL;
+}
+
+/**
+ * unpack_pnode - unpack a pnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed pnode to unpack
+ * @pnode: pnode structure to fill
+ *
+ * The node type is checked first, then the fields are unpacked into @pnode and
+ * finally the CRC is checked, so @pnode may have been modified even when an
+ * error is returned.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_pnode(const struct ubifs_info *c, void *buf,
+			struct ubifs_pnode *pnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, idx, err;
+
+	err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE);
+	if (err)
+		return err;
+
+	if (c->big_lpt)
+		pnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_lprops * const lp = &pnode->lprops[idx];
+
+		/* Free and dirty space are stored divided by 8 */
+		lp->free = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+		lp->free <<= 3;
+		lp->dirty = ubifs_unpack_bits(&addr, &pos, c->space_bits);
+		lp->dirty <<= 3;
+
+		lp->flags = ubifs_unpack_bits(&addr, &pos, 1) ?
+			    LPROPS_INDEX : 0;
+		lp->flags |= ubifs_categorize_lprops(c, lp);
+	}
+
+	return check_lpt_crc(buf, c->pnode_sz);
+}
+
+/**
+ * ubifs_unpack_nnode - unpack a nnode.
+ * @c: UBIFS file-system description object
+ * @buf: buffer containing packed nnode to unpack
+ * @nnode: nnode structure to fill
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+		       struct ubifs_nnode *nnode)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, idx, err;
+
+	err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE);
+	if (err)
+		return err;
+
+	if (c->big_lpt)
+		nnode->num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits);
+
+	for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+		int lnum = ubifs_unpack_bits(&addr, &pos, c->lpt_lnum_bits) +
+			   c->lpt_first;
+
+		/* One past the last LPT LEB stands for LEB number zero */
+		nnode->nbranch[idx].lnum = (lnum == c->lpt_last + 1) ? 0 : lnum;
+		nnode->nbranch[idx].offs = ubifs_unpack_bits(&addr, &pos,
+							    c->lpt_offs_bits);
+	}
+
+	return check_lpt_crc(buf, c->nnode_sz);
+}
+
+/**
+ * unpack_ltab - unpack the LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * Entries are stored into 'c->ltab' as they are unpacked and the CRC is checked
+ * last, so 'c->ltab' may have been modified even when an error is returned.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_ltab(const struct ubifs_info *c, void *buf)
+{
+	uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
+	int pos = 0, idx, err;
+
+	err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB);
+	if (err)
+		return err;
+
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		struct ubifs_lpt_lprops *entry = &c->ltab[idx];
+		int free = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+		int dirty = ubifs_unpack_bits(&addr, &pos, c->lpt_spc_bits);
+
+		/* Neither value, nor their sum, may exceed the LEB size */
+		if (free < 0 || free > c->leb_size)
+			return -EINVAL;
+		if (dirty < 0 || dirty > c->leb_size)
+			return -EINVAL;
+		if (free + dirty > c->leb_size)
+			return -EINVAL;
+
+		entry->free = free;
+		entry->dirty = dirty;
+		entry->tgc = 0;
+		entry->cmt = 0;
+	}
+
+	return check_lpt_crc(buf, c->ltab_sz);
+}
+
+/**
+ * unpack_lsave - unpack the LPT's save table.
+ * @c: UBIFS file-system description object
+ * @buf: buffer from which to unpack
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int unpack_lsave(const struct ubifs_info *c, void *buf)
+{
+	uint8_t *cursor = buf + UBIFS_LPT_CRC_BYTES;
+	int bit = 0, ret, n;
+
+	ret = check_lpt_type(&cursor, &bit, UBIFS_LPT_LSAVE);
+	if (ret)
+		return ret;
+	for (n = 0; n < c->lsave_cnt; n++) {
+		int lnum = ubifs_unpack_bits(&cursor, &bit, c->lnum_bits);
+
+		/* Saved LEB numbers must be in [main_first, leb_cnt) */
+		if (lnum < c->main_first || lnum >= c->leb_cnt)
+			return -EINVAL;
+		c->lsave[n] = lnum;
+	}
+	return check_lpt_crc(buf, c->lsave_sz);
+}
+
+/**
+ * validate_nnode - validate a nnode.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to validate
+ * @parent: parent nnode (or NULL for the root nnode)
+ * @iip: index in parent
+ *
+ * For a big LPT the nnode number is compared with the number calculated from
+ * @parent and @iip. Every branch must then either be unwritten (LEB number
+ * and offset both zero) or point inside the LPT area, at an offset that
+ * leaves room for the child node.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int level, limit, br;
+
+	if (c->big_lpt) {
+		const int expect = calc_nnode_num_from_parent(c, parent, iip);
+
+		if (expect != nnode->num)
+			return -EINVAL;
+	}
+
+	if (parent)
+		level = parent->level - 1;
+	else
+		level = c->lpt_hght;
+	if (level < 1)
+		return -EINVAL;
+
+	/* Branches of a level 1 nnode are bounded by the pnode size */
+	limit = c->leb_size - (level == 1 ? c->pnode_sz : c->nnode_sz);
+
+	for (br = 0; br < UBIFS_LPT_FANOUT; br++) {
+		const int lnum = nnode->nbranch[br].lnum;
+		const int offs = nnode->nbranch[br].offs;
+
+		if (lnum == 0) {
+			/* An unwritten branch must not carry an offset */
+			if (offs != 0)
+				return -EINVAL;
+			continue;
+		}
+		if (lnum < c->lpt_first || lnum > c->lpt_last ||
+		    offs < 0 || offs > limit)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * validate_pnode - validate a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to validate
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
+			  struct ubifs_nnode *parent, int iip)
+{
+	int slot;
+
+	if (c->big_lpt) {
+		const int expect = calc_pnode_num_from_parent(c, parent, iip);
+
+		if (expect != pnode->num)
+			return -EINVAL;
+	}
+
+	for (slot = 0; slot < UBIFS_LPT_FANOUT; slot++) {
+		const int free = pnode->lprops[slot].free;
+		const int dirty = pnode->lprops[slot].dirty;
+
+		if (free < 0 || free > c->leb_size)
+			return -EINVAL;
+		/* Free space is a multiple of the min. I/O size and of 8 */
+		if (free % c->min_io_size || (free & 7))
+			return -EINVAL;
+		if (dirty < 0 || dirty > c->leb_size || (dirty & 7))
+			return -EINVAL;
+		if (free + dirty > c->leb_size)
+			return -EINVAL;
+	}
+	return 0;
+}
+
+/**
+ * set_pnode_lnum - set LEB numbers on a pnode.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to update
+ *
+ * The LEB numbers follow from the pnode number: entry 0 describes LEB
+ * (@pnode->num << %UBIFS_LPT_FANOUT_SHIFT) + @c->main_first and the following
+ * entries describe the following LEBs. Entries whose LEB number would be at
+ * or beyond @c->leb_cnt are left untouched.
+ */
+static void set_pnode_lnum(const struct ubifs_info *c,
+			   struct ubifs_pnode *pnode)
+{
+	const int base = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) +
+			 c->main_first;
+	int slot;
+
+	for (slot = 0; slot < UBIFS_LPT_FANOUT; slot++) {
+		if (base + slot >= c->leb_cnt)
+			break;
+		pnode->lprops[slot].lnum = base + slot;
+	}
+}
+
+/**
+ * ubifs_read_nnode - read a nnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (or NULL for the root)
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *slot = parent ? &parent->nbranch[iip] : NULL;
+	/* The root's position comes from @c, any other from the parent branch */
+	int lnum = slot ? slot->lnum : c->lpt_lnum;
+	int offs = slot ? slot->offs : c->lpt_offs;
+	void *raw = c->lpt_nod_buf;
+	struct ubifs_nnode *node;
+	int ret;
+
+	node = kzalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	if (!node) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	if (lnum != 0) {
+		ret = ubi_read(c->ubi, lnum, raw, offs, c->nnode_sz);
+		if (!ret)
+			ret = ubifs_unpack_nnode(c, raw, node);
+		if (ret)
+			goto fail;
+	} else if (c->big_lpt) {
+		/*
+		 * The nnode was never written, which means the LEB properties
+		 * below it describe empty LEBs. The zeroed nnode is used as
+		 * though it had been read; only its number has to be set.
+		 */
+		node->num = calc_nnode_num_from_parent(c, parent, iip);
+	}
+
+	ret = validate_nnode(c, node, parent, iip);
+	if (ret)
+		goto fail;
+
+	/* Without a big LPT the number is not stored, so calculate it */
+	if (!c->big_lpt)
+		node->num = calc_nnode_num_from_parent(c, parent, iip);
+
+	node->parent = parent;
+	node->iip = iip;
+	if (slot) {
+		node->level = parent->level - 1;
+		slot->nnode = node;
+	} else {
+		node->level = c->lpt_hght;
+		c->nroot = node;
+	}
+	return 0;
+
+fail:
+	ubifs_err("error %d reading nnode at %d:%d", ret, lnum, offs);
+	kfree(node);
+	return ret;
+}
+
+/**
+ * read_pnode - read a pnode from flash and link it to the tree in memory.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode = NULL;
+	void *buf = c->lpt_nod_buf;
+	int err, lnum, offs;
+
+	branch = &parent->nbranch[iip];
+	lnum = branch->lnum;
+	offs = branch->offs;
+	pnode = kzalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	if (!pnode)
+		/*
+		 * Return directly: the error path below dumps the pnode, and
+		 * there is no pnode to dump when the allocation failed.
+		 */
+		return -ENOMEM;
+	if (lnum == 0) {
+		/*
+		 * This pnode was not written which just means that the LEB
+		 * properties in it describe empty LEBs. We make the pnode as
+		 * though we had read it.
+		 */
+		int i;
+
+		if (c->big_lpt)
+			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+			lprops->free = c->leb_size;
+			lprops->flags = ubifs_categorize_lprops(c, lprops);
+		}
+	} else {
+		err = ubi_read(c->ubi, lnum, buf, offs, c->pnode_sz);
+		if (err)
+			goto out;
+		err = unpack_pnode(c, buf, pnode);
+		if (err)
+			goto out;
+	}
+	err = validate_pnode(c, pnode, parent, iip);
+	if (err)
+		goto out;
+	/* Without a big LPT the number is not stored, so calculate it */
+	if (!c->big_lpt)
+		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+	/* Link the pnode into the tree and account for it */
+	branch->pnode = pnode;
+	pnode->parent = parent;
+	pnode->iip = iip;
+	set_pnode_lnum(c, pnode);
+	c->pnodes_have += 1;
+	return 0;
+
+out:
+	ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
+	dbg_dump_pnode(c, pnode, parent, iip);
+	dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+	kfree(pnode);
+	return err;
+}
+
+/**
+ * read_ltab - read LPT's own lprops table.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_ltab(struct ubifs_info *c)
+{
+	int err;
+	void *buf;
+
+	/* Temporary buffer for the packed table, freed before returning */
+	buf = vmalloc(c->ltab_sz);
+	if (!buf)
+		return -ENOMEM;
+	err = ubi_read(c->ubi, c->ltab_lnum, buf, c->ltab_offs, c->ltab_sz);
+	if (err)
+		goto out;
+	err = unpack_ltab(c, buf);
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * read_lsave - read LPT's save table.
+ * @c: UBIFS file-system description object
+ *
+ * After the table has been unpacked into @c->lsave, every saved LEB number is
+ * looked up in the LPT. A failed lookup aborts the function and its error
+ * code is returned.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int read_lsave(struct ubifs_info *c)
+{
+	int err, i;
+	void *buf;
+
+	buf = vmalloc(c->lsave_sz);
+	if (!buf)
+		return -ENOMEM;
+	err = ubi_read(c->ubi, c->lsave_lnum, buf, c->lsave_offs, c->lsave_sz);
+	if (err)
+		goto out;
+	err = unpack_lsave(c, buf);
+	if (err)
+		goto out;
+	for (i = 0; i < c->lsave_cnt; i++) {
+		int lnum = c->lsave[i];
+		struct ubifs_lprops *lprops;
+
+		/*
+		 * Due to automatic resizing, the values in the lsave table
+		 * could be beyond the volume size - just ignore them.
+		 */
+		if (lnum >= c->leb_cnt)
+			continue;
+		/* Do not report success if the lookup failed */
+		lprops = ubifs_lpt_lookup(c, lnum);
+		if (IS_ERR(lprops)) {
+			err = PTR_ERR(lprops);
+			goto out;
+		}
+	}
+out:
+	vfree(buf);
+	return err;
+}
+
+/**
+ * ubifs_get_nnode - get a nnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode (must not be NULL, it is dereferenced unconditionally)
+ * @iip: index in parent
+ *
+ * The nnode is taken from the in-memory tree if it is already there,
+ * otherwise it is read with 'ubifs_read_nnode()'.
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_nnode *nnode;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	nnode = branch->nnode;
+	if (nnode)
+		return nnode;
+	err = ubifs_read_nnode(c, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	/* A successful read has linked the new nnode into the branch */
+	return branch->nnode;
+}
+
+/**
+ * ubifs_get_pnode - get a pnode.
+ * @c: UBIFS file-system description object
+ * @parent: parent nnode
+ * @iip: index in parent
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c,
+				    struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *slot = &parent->nbranch[iip];
+	int ret;
+
+	if (!slot->pnode) {
+		/* Not in memory yet: read it, then update its categories */
+		ret = read_pnode(c, parent, iip);
+		if (ret)
+			return ERR_PTR(ret);
+		update_cats(c, slot->pnode);
+	}
+	return slot->pnode;
+}
+
+/**
+ * ubifs_lpt_lookup - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * The tree is descended from the root, which is read first if it is not in
+ * memory yet. At each level %UBIFS_LPT_FANOUT_SHIFT bits of
+ * (@lnum - @c->main_first) select the branch to follow.
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum)
+{
+	const int mask = UBIFS_LPT_FANOUT - 1;
+	struct ubifs_nnode *nn;
+	struct ubifs_pnode *pn;
+	struct ubifs_lprops *lp;
+	int idx, shift, depth, ret;
+
+	if (!c->nroot) {
+		ret = ubifs_read_nnode(c, NULL, 0);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+	nn = c->nroot;
+	idx = lnum - c->main_first;
+	shift = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+
+	/* Walk down the nnode levels */
+	for (depth = 1; depth < c->lpt_hght; depth++) {
+		nn = ubifs_get_nnode(c, nn, (idx >> shift) & mask);
+		if (IS_ERR(nn))
+			return ERR_PTR(PTR_ERR(nn));
+		shift -= UBIFS_LPT_FANOUT_SHIFT;
+	}
+
+	pn = ubifs_get_pnode(c, nn, (idx >> shift) & mask);
+	if (IS_ERR(pn))
+		return ERR_PTR(PTR_ERR(pn));
+
+	/* The lowest bits select the entry within the pnode */
+	lp = &pn->lprops[idx & mask];
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       lp->free, lp->dirty, lp->flags);
+	return lp;
+}
+
+/**
+ * dirty_cow_nnode - ensure a nnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @nnode: nnode to check
+ *
+ * Returns dirtied nnode on success or negative error code on failure.
+ */
+static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
+					   struct ubifs_nnode *nnode)
+{
+	struct ubifs_nnode *n;
+	int i;
+
+	if (!test_bit(COW_CNODE, &nnode->flags)) {
+		/* nnode is not being committed */
+		if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) {
+			/* Newly dirtied: count it and account its dirt */
+			c->dirty_nn_cnt += 1;
+			ubifs_add_nnode_dirt(c, nnode);
+		}
+		return nnode;
+	}
+
+	/* nnode is being committed, so copy it */
+	n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+	if (unlikely(!n))
+		return ERR_PTR(-ENOMEM);
+
+	/* The copy is dirty, not copy-on-write and not on the commit list */
+	memcpy(n, nnode, sizeof(struct ubifs_nnode));
+	n->cnext = NULL;
+	__set_bit(DIRTY_CNODE, &n->flags);
+	__clear_bit(COW_CNODE, &n->flags);
+
+	/* The children now have new parent */
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_nbranch *branch = &n->nbranch[i];
+
+		if (branch->cnode)
+			branch->cnode->parent = n;
+	}
+
+	/* The original is marked obsolete; the copy replaces it in the tree */
+	ubifs_assert(!test_bit(OBSOLETE_CNODE, &nnode->flags));
+	__set_bit(OBSOLETE_CNODE, &nnode->flags);
+
+	c->dirty_nn_cnt += 1;
+	ubifs_add_nnode_dirt(c, nnode);
+	if (nnode->parent)
+		nnode->parent->nbranch[n->iip].nnode = n;
+	else
+		c->nroot = n;
+	return n;
+}
+
+/**
+ * dirty_cow_pnode - ensure a pnode is not being committed.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode to check
+ *
+ * Returns dirtied pnode on success or negative error code on failure.
+ */
+static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
+					   struct ubifs_pnode *pnode)
+{
+	struct ubifs_pnode *p;
+
+	if (!test_bit(COW_CNODE, &pnode->flags)) {
+		/* pnode is not being committed */
+		if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) {
+			/* Newly dirtied: count it and account its dirt */
+			c->dirty_pn_cnt += 1;
+			add_pnode_dirt(c, pnode);
+		}
+		return pnode;
+	}
+
+	/* pnode is being committed, so copy it */
+	p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+	if (unlikely(!p))
+		return ERR_PTR(-ENOMEM);
+
+	/* The copy is dirty, not copy-on-write and not on the commit list */
+	memcpy(p, pnode, sizeof(struct ubifs_pnode));
+	p->cnext = NULL;
+	__set_bit(DIRTY_CNODE, &p->flags);
+	__clear_bit(COW_CNODE, &p->flags);
+	replace_cats(c, pnode, p);
+
+	/* The original is marked obsolete; the copy replaces it in the tree */
+	ubifs_assert(!test_bit(OBSOLETE_CNODE, &pnode->flags));
+	__set_bit(OBSOLETE_CNODE, &pnode->flags);
+
+	c->dirty_pn_cnt += 1;
+	add_pnode_dirt(c, pnode);
+	pnode->parent->nbranch[p->iip].pnode = p;
+	return p;
+}
+
+/**
+ * ubifs_lpt_lookup_dirty - lookup LEB properties in the LPT.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to lookup
+ *
+ * This function returns a pointer to the LEB properties on success or a
+ * negative error code on failure.
+ */
+struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum)
+{
+	const int mask = UBIFS_LPT_FANOUT - 1;
+	struct ubifs_nnode *nn;
+	struct ubifs_pnode *pn;
+	struct ubifs_lprops *lp;
+	int idx, shift, depth, ret;
+
+	if (!c->nroot) {
+		ret = ubifs_read_nnode(c, NULL, 0);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	/* Every node on the path, starting with the root, is dirtied */
+	nn = dirty_cow_nnode(c, c->nroot);
+	if (IS_ERR(nn))
+		return ERR_PTR(PTR_ERR(nn));
+
+	idx = lnum - c->main_first;
+	shift = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (depth = 1; depth < c->lpt_hght; depth++) {
+		nn = ubifs_get_nnode(c, nn, (idx >> shift) & mask);
+		if (IS_ERR(nn))
+			return ERR_PTR(PTR_ERR(nn));
+		nn = dirty_cow_nnode(c, nn);
+		if (IS_ERR(nn))
+			return ERR_PTR(PTR_ERR(nn));
+		shift -= UBIFS_LPT_FANOUT_SHIFT;
+	}
+
+	pn = ubifs_get_pnode(c, nn, (idx >> shift) & mask);
+	if (IS_ERR(pn))
+		return ERR_PTR(PTR_ERR(pn));
+	pn = dirty_cow_pnode(c, pn);
+	if (IS_ERR(pn))
+		return ERR_PTR(PTR_ERR(pn));
+
+	/* The lowest bits select the entry within the pnode */
+	lp = &pn->lprops[idx & mask];
+	dbg_lp("LEB %d, free %d, dirty %d, flags %d", lnum,
+	       lp->free, lp->dirty, lp->flags);
+	ubifs_assert(test_bit(DIRTY_CNODE, &pn->flags));
+	return lp;
+}
+
+/**
+ * lpt_init_rd - initialize the LPT for reading.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_rd(struct ubifs_info *c)
+{
+	int err, i;
+
+	/*
+	 * NOTE(review): nothing allocated here is freed on the failure paths
+	 * below - presumably the caller releases it; confirm.
+	 */
+
+	/* In-memory lprops table of the LPT area, one entry per LPT LEB */
+	c->ltab = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab)
+		return -ENOMEM;
+
+	/* One buffer large enough for either a packed nnode or a packed pnode */
+	i = max_t(int, c->nnode_sz, c->pnode_sz);
+	c->lpt_nod_buf = kmalloc(i, GFP_KERNEL);
+	if (!c->lpt_nod_buf)
+		return -ENOMEM;
+
+	/* Each category heap starts empty with room for LPT_HEAP_SZ entries */
+	for (i = 0; i < LPROPS_HEAP_CNT; i++) {
+		c->lpt_heap[i].arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ,
+					     GFP_KERNEL);
+		if (!c->lpt_heap[i].arr)
+			return -ENOMEM;
+		c->lpt_heap[i].cnt = 0;
+		c->lpt_heap[i].max_cnt = LPT_HEAP_SZ;
+	}
+
+	c->dirty_idx.arr = kmalloc(sizeof(void *) * LPT_HEAP_SZ, GFP_KERNEL);
+	if (!c->dirty_idx.arr)
+		return -ENOMEM;
+	c->dirty_idx.cnt = 0;
+	c->dirty_idx.max_cnt = LPT_HEAP_SZ;
+
+	err = read_ltab(c);
+	if (err)
+		return err;
+
+	/* Debugging output of the LPT geometry and positions */
+	dbg_lp("space_bits %d", c->space_bits);
+	dbg_lp("lpt_lnum_bits %d", c->lpt_lnum_bits);
+	dbg_lp("lpt_offs_bits %d", c->lpt_offs_bits);
+	dbg_lp("lpt_spc_bits %d", c->lpt_spc_bits);
+	dbg_lp("pcnt_bits %d", c->pcnt_bits);
+	dbg_lp("lnum_bits %d", c->lnum_bits);
+	dbg_lp("pnode_sz %d", c->pnode_sz);
+	dbg_lp("nnode_sz %d", c->nnode_sz);
+	dbg_lp("ltab_sz %d", c->ltab_sz);
+	dbg_lp("lsave_sz %d", c->lsave_sz);
+	dbg_lp("lsave_cnt %d", c->lsave_cnt);
+	dbg_lp("lpt_hght %d", c->lpt_hght);
+	dbg_lp("big_lpt %d", c->big_lpt);
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+	return 0;
+}
+
+/**
+ * lpt_init_wr - initialize the LPT for writing.
+ * @c: UBIFS file-system description object
+ *
+ * 'lpt_init_rd()' must have been called already.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int lpt_init_wr(struct ubifs_info *c)
+{
+	int ret, idx;
+
+	c->ltab_cmt = vmalloc(sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	if (!c->ltab_cmt)
+		return -ENOMEM;
+
+	c->lpt_buf = vmalloc(c->leb_size);
+	if (!c->lpt_buf)
+		return -ENOMEM;
+
+	/* The save table is only allocated and read for a big LPT */
+	if (c->big_lpt) {
+		c->lsave = kmalloc(sizeof(int) * c->lsave_cnt, GFP_NOFS);
+		if (!c->lsave)
+			return -ENOMEM;
+		ret = read_lsave(c);
+		if (ret)
+			return ret;
+	}
+
+	/* Unmap every LPT LEB that the ltab records as entirely free */
+	for (idx = 0; idx < c->lpt_lebs; idx++) {
+		if (c->ltab[idx].free != c->leb_size)
+			continue;
+		ret = ubifs_leb_unmap(c, idx + c->lpt_first);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * ubifs_lpt_init - initialize the LPT.
+ * @c: UBIFS file-system description object
+ * @rd: whether to initialize lpt for reading
+ * @wr: whether to initialize lpt for writing
+ *
+ * For mounting 'rw', @rd and @wr are both true. For mounting 'ro', @rd is true
+ * and @wr is false. For mounting from 'ro' to 'rw', @rd is false and @wr is
+ * true. Initialization for writing is not attempted if initialization for
+ * reading fails.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr)
+{
+	int ret = 0;
+
+	if (rd)
+		ret = lpt_init_rd(c);
+	if (!ret && wr)
+		ret = lpt_init_wr(c);
+	return ret;
+}
+
+/**
+ * struct lpt_scan_node - somewhere to put nodes while we scan LPT.
+ * @nnode: where to keep a nnode
+ * @pnode: where to keep a pnode
+ * @cnode: where to keep a cnode
+ * @in_tree: is the node in the tree in memory
+ * @ptr.nnode: pointer to the nnode (if it is an nnode) which may be here or in
+ *             the tree
+ * @ptr.pnode: ditto for pnode
+ * @ptr.cnode: ditto for cnode
+ *
+ * The first three members share storage (anonymous union), as do the three
+ * pointers in @ptr.
+ */
+struct lpt_scan_node {
+	union {
+		struct ubifs_nnode nnode;
+		struct ubifs_pnode pnode;
+		struct ubifs_cnode cnode;
+	};
+	int in_tree;
+	union {
+		struct ubifs_nnode *nnode;
+		struct ubifs_pnode *pnode;
+		struct ubifs_cnode *cnode;
+	} ptr;
+};
+
+/**
+ * scan_get_nnode - for the scan, get a nnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the nnode
+ * @parent: parent of the nnode
+ * @iip: index in parent of the nnode
+ *
+ * If the nnode is already in the tree in memory, that nnode is returned and
+ * @path->in_tree is set. Otherwise the nnode is built in @path (read from
+ * flash if the branch has a location) and it is not linked into the tree.
+ *
+ * This function returns a pointer to the nnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_nnode *nnode;
+	void *buf = c->lpt_nod_buf;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	nnode = branch->nnode;
+	if (nnode) {
+		/* Already in memory: use the tree's own nnode */
+		path->in_tree = 1;
+		path->ptr.nnode = nnode;
+		return nnode;
+	}
+	/* Not in memory: build the nnode in the caller-provided path slot */
+	nnode = &path->nnode;
+	path->in_tree = 0;
+	path->ptr.nnode = nnode;
+	memset(nnode, 0, sizeof(struct ubifs_nnode));
+	if (branch->lnum == 0) {
+		/*
+		 * This nnode was not written which just means that the LEB
+		 * properties in the subtree below it describe empty LEBs. We
+		 * make the nnode as though we had read it, which in fact means
+		 * doing almost nothing.
+		 */
+		if (c->big_lpt)
+			nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	} else {
+		err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
+			       c->nnode_sz);
+		if (err)
+			return ERR_PTR(err);
+		err = ubifs_unpack_nnode(c, buf, nnode);
+		if (err)
+			return ERR_PTR(err);
+	}
+	err = validate_nnode(c, nnode, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	/* Without a big LPT the number is not stored, so calculate it */
+	if (!c->big_lpt)
+		nnode->num = calc_nnode_num_from_parent(c, parent, iip);
+	/* The nnode refers to its parent, but the parent is not updated */
+	nnode->level = parent->level - 1;
+	nnode->parent = parent;
+	nnode->iip = iip;
+	return nnode;
+}
+
+/**
+ * scan_get_pnode - for the scan, get a pnode from either the tree or flash.
+ * @c: the UBIFS file-system description object
+ * @path: where to put the pnode
+ * @parent: parent of the pnode
+ * @iip: index in parent of the pnode
+ *
+ * This function returns a pointer to the pnode on success or a negative error
+ * code on failure.
+ */
+static struct ubifs_pnode *scan_get_pnode(struct ubifs_info *c,
+					  struct lpt_scan_node *path,
+					  struct ubifs_nnode *parent, int iip)
+{
+	struct ubifs_nbranch *branch;
+	struct ubifs_pnode *pnode;
+	void *buf = c->lpt_nod_buf;
+	int err;
+
+	branch = &parent->nbranch[iip];
+	pnode = branch->pnode;
+	if (pnode) {
+		/* Already in memory: use the tree's own pnode */
+		path->in_tree = 1;
+		path->ptr.pnode = pnode;
+		return pnode;
+	}
+	/* Not in memory: build the pnode in the caller-provided path slot */
+	pnode = &path->pnode;
+	path->in_tree = 0;
+	path->ptr.pnode = pnode;
+	memset(pnode, 0, sizeof(struct ubifs_pnode));
+	if (branch->lnum == 0) {
+		/*
+		 * This pnode was not written which just means that the LEB
+		 * properties in it describe empty LEBs. We make the pnode as
+		 * though we had read it.
+		 */
+		int i;
+
+		if (c->big_lpt)
+			pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+		for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+			struct ubifs_lprops * const lprops = &pnode->lprops[i];
+
+			lprops->free = c->leb_size;
+			lprops->flags = ubifs_categorize_lprops(c, lprops);
+		}
+	} else {
+		/* The branch must point inside the LPT area */
+		ubifs_assert(branch->lnum >= c->lpt_first &&
+			     branch->lnum <= c->lpt_last);
+		ubifs_assert(branch->offs >= 0 && branch->offs < c->leb_size);
+		err = ubi_read(c->ubi, branch->lnum, buf, branch->offs,
+			       c->pnode_sz);
+		if (err)
+			return ERR_PTR(err);
+		err = unpack_pnode(c, buf, pnode);
+		if (err)
+			return ERR_PTR(err);
+	}
+	err = validate_pnode(c, pnode, parent, iip);
+	if (err)
+		return ERR_PTR(err);
+	/* Without a big LPT the number is not stored, so calculate it */
+	if (!c->big_lpt)
+		pnode->num = calc_pnode_num_from_parent(c, parent, iip);
+	/* The pnode refers to its parent, but the parent is not updated */
+	pnode->parent = parent;
+	pnode->iip = iip;
+	set_pnode_lnum(c, pnode);
+	return pnode;
+}
+
+/**
+ * ubifs_lpt_scan_nolock - scan the LPT.
+ * @c: the UBIFS file-system description object
+ * @start_lnum: LEB number from which to start scanning
+ * @end_lnum: LEB number at which to stop scanning
+ * @scan_cb: callback function called for each lprops
+ * @data: data to be passed to the callback function
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum,
+			  ubifs_lpt_scan_callback scan_cb, void *data)
+{
+	int err = 0, i, h, iip, shft;
+	struct ubifs_nnode *nnode;
+	struct ubifs_pnode *pnode;
+	struct lpt_scan_node *path;
+
+	/* A start of -1 means: start just after @end_lnum, wrapping around */
+	if (start_lnum == -1) {
+		start_lnum = end_lnum + 1;
+		if (start_lnum >= c->leb_cnt)
+			start_lnum = c->main_first;
+	}
+
+	ubifs_assert(start_lnum >= c->main_first && start_lnum < c->leb_cnt);
+	ubifs_assert(end_lnum >= c->main_first && end_lnum < c->leb_cnt);
+
+	if (!c->nroot) {
+		err = ubifs_read_nnode(c, NULL, 0);
+		if (err)
+			return err;
+	}
+
+	/* One slot per tree level: the root, the other nnodes and the pnode */
+	path = kmalloc(sizeof(struct lpt_scan_node) * (c->lpt_hght + 1),
+		       GFP_NOFS);
+	if (!path)
+		return -ENOMEM;
+
+	path[0].ptr.nnode = c->nroot;
+	path[0].in_tree = 1;
+again:
+	/* Descend to the pnode containing start_lnum */
+	nnode = c->nroot;
+	i = start_lnum - c->main_first;
+	shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT;
+	for (h = 1; h < c->lpt_hght; h++) {
+		iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+		shft -= UBIFS_LPT_FANOUT_SHIFT;
+		nnode = scan_get_nnode(c, path + h, nnode, iip);
+		if (IS_ERR(nnode)) {
+			err = PTR_ERR(nnode);
+			goto out;
+		}
+	}
+	iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1));
+	shft -= UBIFS_LPT_FANOUT_SHIFT;
+	pnode = scan_get_pnode(c, path + h, nnode, iip);
+	if (IS_ERR(pnode)) {
+		err = PTR_ERR(pnode);
+		goto out;
+	}
+	iip = (i & (UBIFS_LPT_FANOUT - 1));
+
+	/* Loop for each lprops */
+	while (1) {
+		struct ubifs_lprops *lprops = &pnode->lprops[iip];
+		int ret, lnum = lprops->lnum;
+
+		/* A negative result aborts the scan with that error code */
+		ret = scan_cb(c, lprops, path[h].in_tree, data);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret & LPT_SCAN_ADD) {
+			/* Add all the nodes in path to the tree in memory */
+			for (h = 1; h < c->lpt_hght; h++) {
+				const size_t sz = sizeof(struct ubifs_nnode);
+				struct ubifs_nnode *parent;
+
+				if (path[h].in_tree)
+					continue;
+				nnode = kmalloc(sz, GFP_NOFS);
+				if (!nnode) {
+					err = -ENOMEM;
+					goto out;
+				}
+				memcpy(nnode, &path[h].nnode, sz);
+				parent = nnode->parent;
+				parent->nbranch[nnode->iip].nnode = nnode;
+				path[h].ptr.nnode = nnode;
+				path[h].in_tree = 1;
+				/* The next path entry must refer to the copy */
+				path[h + 1].cnode.parent = nnode;
+			}
+			/* Here h is the pnode's slot in the path */
+			if (path[h].in_tree)
+				ubifs_ensure_cat(c, lprops);
+			else {
+				const size_t sz = sizeof(struct ubifs_pnode);
+				struct ubifs_nnode *parent;
+
+				pnode = kmalloc(sz, GFP_NOFS);
+				if (!pnode) {
+					err = -ENOMEM;
+					goto out;
+				}
+				memcpy(pnode, &path[h].pnode, sz);
+				parent = pnode->parent;
+				parent->nbranch[pnode->iip].pnode = pnode;
+				path[h].ptr.pnode = pnode;
+				path[h].in_tree = 1;
+				update_cats(c, pnode);
+				c->pnodes_have += 1;
+			}
+			err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)
+						  c->nroot, 0, 0);
+			if (err)
+				goto out;
+			err = dbg_check_cats(c);
+			if (err)
+				goto out;
+		}
+		if (ret & LPT_SCAN_STOP) {
+			err = 0;
+			break;
+		}
+		/* Get the next lprops */
+		if (lnum == end_lnum) {
+			/*
+			 * We got to the end without finding what we were
+			 * looking for
+			 */
+			err = -ENOSPC;
+			goto out;
+		}
+		if (lnum + 1 >= c->leb_cnt) {
+			/* Wrap-around to the beginning */
+			start_lnum = c->main_first;
+			goto again;
+		}
+		if (iip + 1 < UBIFS_LPT_FANOUT) {
+			/* Next lprops is in the same pnode */
+			iip += 1;
+			continue;
+		}
+		/* We need to get the next pnode. Go up until we can go right */
+		iip = pnode->iip;
+		while (1) {
+			h -= 1;
+			ubifs_assert(h >= 0);
+			nnode = path[h].ptr.nnode;
+			if (iip + 1 < UBIFS_LPT_FANOUT)
+				break;
+			iip = nnode->iip;
+		}
+		/* Go right */
+		iip += 1;
+		/* Descend to the pnode */
+		h += 1;
+		for (; h < c->lpt_hght; h++) {
+			nnode = scan_get_nnode(c, path + h, nnode, iip);
+			if (IS_ERR(nnode)) {
+				err = PTR_ERR(nnode);
+				goto out;
+			}
+			/* Below the turn, always take the leftmost branch */
+			iip = 0;
+		}
+		pnode = scan_get_pnode(c, path + h, nnode, iip);
+		if (IS_ERR(pnode)) {
+			err = PTR_ERR(pnode);
+			goto out;
+		}
+		iip = 0;
+	}
+out:
+	kfree(path);
+	return err;
+}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+
+/**
+ * dbg_chk_pnode - check a pnode.
+ * @c: the UBIFS file-system description object
+ * @pnode: pnode to check
+ * @col: pnode column
+ *
+ * Checks that the pnode number equals @col and, for every LEB the pnode
+ * describes, that the LEB number, the category flags, the membership of the
+ * corresponding category heap or list and the free/dirty totals are
+ * consistent.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+			 int col)
+{
+	int i;
+
+	if (pnode->num != col) {
+		dbg_err("pnode num %d expected %d parent num %d iip %d",
+			pnode->num, col, pnode->parent->num, pnode->iip);
+		return -EINVAL;
+	}
+	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+		struct ubifs_lprops *lp, *lprops = &pnode->lprops[i];
+		int lnum = (pnode->num << UBIFS_LPT_FANOUT_SHIFT) + i +
+			   c->main_first;
+		int found, cat = lprops->flags & LPROPS_CAT_MASK;
+		struct ubifs_lpt_heap *heap;
+		struct list_head *list = NULL;
+
+		/* Entries beyond the last LEB are not checked */
+		if (lnum >= c->leb_cnt)
+			continue;
+		if (lprops->lnum != lnum) {
+			dbg_err("bad LEB number %d expected %d",
+				lprops->lnum, lnum);
+			return -EINVAL;
+		}
+		/* A taken LEB must be uncategorized; nothing else is checked */
+		if (lprops->flags & LPROPS_TAKEN) {
+			if (cat != LPROPS_UNCAT) {
+				dbg_err("LEB %d taken but not uncat %d",
+					lprops->lnum, cat);
+				return -EINVAL;
+			}
+			continue;
+		}
+		/* The category must be one allowed for the kind of LEB */
+		if (lprops->flags & LPROPS_INDEX) {
+			switch (cat) {
+			case LPROPS_UNCAT:
+			case LPROPS_DIRTY_IDX:
+			case LPROPS_FRDI_IDX:
+				break;
+			default:
+				dbg_err("LEB %d index but cat %d",
+					lprops->lnum, cat);
+				return -EINVAL;
+			}
+		} else {
+			switch (cat) {
+			case LPROPS_UNCAT:
+			case LPROPS_DIRTY:
+			case LPROPS_FREE:
+			case LPROPS_EMPTY:
+			case LPROPS_FREEABLE:
+				break;
+			default:
+				dbg_err("LEB %d not index but cat %d",
+					lprops->lnum, cat);
+				return -EINVAL;
+			}
+		}
+		/* Pick the list for the list-based categories */
+		switch (cat) {
+		case LPROPS_UNCAT:
+			list = &c->uncat_list;
+			break;
+		case LPROPS_EMPTY:
+			list = &c->empty_list;
+			break;
+		case LPROPS_FREEABLE:
+			list = &c->freeable_list;
+			break;
+		case LPROPS_FRDI_IDX:
+			list = &c->frdi_idx_list;
+			break;
+		}
+		/* The lprops must be in the heap or list of its category */
+		found = 0;
+		switch (cat) {
+		case LPROPS_DIRTY:
+		case LPROPS_DIRTY_IDX:
+		case LPROPS_FREE:
+			heap = &c->lpt_heap[cat - 1];
+			if (lprops->hpos < heap->cnt &&
+			    heap->arr[lprops->hpos] == lprops)
+				found = 1;
+			break;
+		case LPROPS_UNCAT:
+		case LPROPS_EMPTY:
+		case LPROPS_FREEABLE:
+		case LPROPS_FRDI_IDX:
+			list_for_each_entry(lp, list, list)
+				if (lprops == lp) {
+					found = 1;
+					break;
+				}
+			break;
+		}
+		if (!found) {
+			dbg_err("LEB %d cat %d not found in cat heap/list",
+				lprops->lnum, cat);
+			return -EINVAL;
+		}
+		switch (cat) {
+		case LPROPS_EMPTY:
+			if (lprops->free != c->leb_size) {
+				dbg_err("LEB %d cat %d free %d dirty %d",
+					lprops->lnum, cat, lprops->free,
+					lprops->dirty);
+				return -EINVAL;
+			}
+			/* fall through - the total is checked as well */
+		case LPROPS_FREEABLE:
+		case LPROPS_FRDI_IDX:
+			if (lprops->free + lprops->dirty != c->leb_size) {
+				dbg_err("LEB %d cat %d free %d dirty %d",
+					lprops->lnum, cat, lprops->free,
+					lprops->dirty);
+				return -EINVAL;
+			}
+		}
+	}
+	return 0;
+}
+
+/**
+ * dbg_check_lpt_nodes - check nnodes and pnodes.
+ * @c: the UBIFS file-system description object
+ * @cnode: next cnode (nnode or pnode) to check
+ * @row: row of cnode (root is zero)
+ * @col: column of cnode (leftmost is zero)
+ *
+ * The in-memory tree is walked iteratively starting from @cnode. Nothing is
+ * checked unless %UBIFS_CHK_LPROPS is set in 'ubifs_chk_flags'.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode,
+			int row, int col)
+{
+	struct ubifs_nnode *nnode, *nn;
+	struct ubifs_cnode *cn;
+	int num, iip = 0, err;
+
+	if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+		return 0;
+
+	while (cnode) {
+		ubifs_assert(row >= 0);
+		nnode = cnode->parent;
+		if (cnode->level) {
+			/* cnode is a nnode */
+			num = calc_nnode_num(row, col);
+			if (cnode->num != num) {
+				dbg_err("nnode num %d expected %d "
+					"parent num %d iip %d", cnode->num, num,
+					(nnode ? nnode->num : 0), cnode->iip);
+				return -EINVAL;
+			}
+			nn = (struct ubifs_nnode *)cnode;
+			/* Find the next child that is in memory, from iip on */
+			while (iip < UBIFS_LPT_FANOUT) {
+				cn = nn->nbranch[iip].cnode;
+				if (cn) {
+					/* Go down */
+					row += 1;
+					col <<= UBIFS_LPT_FANOUT_SHIFT;
+					col += iip;
+					iip = 0;
+					cnode = cn;
+					break;
+				}
+				/* Go right */
+				iip += 1;
+			}
+			/* A child was found: check it on the next iteration */
+			if (iip < UBIFS_LPT_FANOUT)
+				continue;
+		} else {
+			struct ubifs_pnode *pnode;
+
+			/* cnode is a pnode */
+			pnode = (struct ubifs_pnode *)cnode;
+			err = dbg_chk_pnode(c, pnode, col);
+			if (err)
+				return err;
+		}
+		/* Go up and to the right */
+		row -= 1;
+		col >>= UBIFS_LPT_FANOUT_SHIFT;
+		iip = cnode->iip + 1;
+		cnode = (struct ubifs_cnode *)nnode;
+	}
+	return 0;
+}
+
+#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -Nurd linux-2.6.24/fs/ubifs/lpt_commit.c ubifs-v2.6.24/fs/ubifs/lpt_commit.c
--- linux-2.6.24/fs/ubifs/lpt_commit.c 1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/lpt_commit.c 2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,1973 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file implements commit-related functionality of the LEB properties
+ * subsystem.
+ */
+
+#include
+#include "ubifs.h"
+
+/**
+ * first_dirty_cnode - find first dirty cnode.
+ * @nnode: nnode at which to start
+ *
+ * This function descends from @nnode, always following the first dirty child.
+ * It returns the first dirty pnode reached, or the last nnode reached that
+ * has no dirty children (which is @nnode itself if none of its children is
+ * dirty).
+ */
+static struct ubifs_cnode *first_dirty_cnode(struct ubifs_nnode *nnode)
+{
+	ubifs_assert(nnode);
+	for (;;) {
+		struct ubifs_cnode *child = NULL;
+		int idx;
+
+		/* Find the first dirty child of the current nnode */
+		for (idx = 0; idx < UBIFS_LPT_FANOUT; idx++) {
+			struct ubifs_cnode *cand = nnode->nbranch[idx].cnode;
+
+			if (!cand || !test_bit(DIRTY_CNODE, &cand->flags))
+				continue;
+			child = cand;
+			break;
+		}
+		if (!child)
+			return (struct ubifs_cnode *)nnode;
+		if (child->level == 0)
+			return child; /* child is a pnode */
+		/* child is a nnode, keep descending */
+		nnode = (struct ubifs_nnode *)child;
+	}
+}
+
+/**
+ * next_dirty_cnode - find next dirty cnode.
+ * @cnode: cnode from which to begin searching
+ *
+ * The siblings to the right of @cnode are searched for a dirty cnode. If
+ * there is none, the parent is returned.
+ *
+ * This function returns the next dirty cnode or %NULL if @cnode has no
+ * parent.
+ */
+static struct ubifs_cnode *next_dirty_cnode(struct ubifs_cnode *cnode)
+{
+	struct ubifs_nnode *parent;
+	int idx;
+
+	ubifs_assert(cnode);
+	parent = cnode->parent;
+	if (!parent)
+		return NULL;
+	for (idx = cnode->iip + 1; idx < UBIFS_LPT_FANOUT; idx++) {
+		struct ubifs_cnode *sib = parent->nbranch[idx].cnode;
+
+		if (!sib || !test_bit(DIRTY_CNODE, &sib->flags))
+			continue;
+		if (sib->level == 0)
+			return sib; /* sib is a pnode */
+		/* sib is a nnode */
+		return first_dirty_cnode((struct ubifs_nnode *)sib);
+	}
+	return (struct ubifs_cnode *)parent;
+}
+
+/**
+ * get_cnodes_to_commit - create list of dirty cnodes to commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns the number of cnodes to commit.
+ */
+static int get_cnodes_to_commit(struct ubifs_info *c)
+{
+	struct ubifs_cnode *cur, *next;
+	int cnt;
+
+	/* Nothing to do without a root, or with a clean root */
+	if (!c->nroot || !test_bit(DIRTY_CNODE, &c->nroot->flags))
+		return 0;
+
+	cur = first_dirty_cnode(c->nroot);
+	c->lpt_cnext = cur;
+	if (!cur)
+		return 0;
+
+	/* Chain the dirty cnodes through ->cnext, flagging each one */
+	for (cnt = 1; ; cnt++) {
+		ubifs_assert(!test_bit(COW_ZNODE, &cur->flags));
+		__set_bit(COW_ZNODE, &cur->flags);
+		next = next_dirty_cnode(cur);
+		if (!next)
+			break;
+		cur->cnext = next;
+		cur = next;
+	}
+	/* The last cnode points back to the first one */
+	cur->cnext = c->lpt_cnext;
+
+	dbg_cmt("committing %d cnodes", cnt);
+	dbg_lp("committing %d cnodes", cnt);
+	ubifs_assert(cnt == c->dirty_nn_cnt + c->dirty_pn_cnt);
+	return cnt;
+}
+
+/**
+ * upd_ltab - update LPT LEB properties.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number
+ * @free: amount of free space (replaces the stored value)
+ * @dirty: amount of dirty space to add to the stored value
+ */
+static void upd_ltab(struct ubifs_info *c, int lnum, int free, int dirty)
+{
+	const int idx = lnum - c->lpt_first;
+
+	dbg_lp("LEB %d free %d dirty %d to %d +%d",
+	       lnum, c->ltab[idx].free, c->ltab[idx].dirty, free, dirty);
+	ubifs_assert(lnum >= c->lpt_first && lnum <= c->lpt_last);
+	c->ltab[idx].free = free;
+	c->ltab[idx].dirty += dirty;
+}
+
+/**
+ * alloc_lpt_leb - allocate an LPT LEB that is empty.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function finds the next empty LEB in the ltab starting from @lnum. If
+ * an empty LEB is found it is returned in @lnum and the function returns %0.
+ * Otherwise the function returns -ENOSPC. Note however, that LPT is designed
+ * never to run out of space.
+ */
+static int alloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, n;
+
+	/* ltab index of the LEB that follows *lnum */
+	n = *lnum - c->lpt_first + 1;
+	/* First search from there to the end of the LPT area ... */
+	for (i = n; i < c->lpt_lebs; i++) {
+		/* Skip LEBs marked for trivial GC or already taken */
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (c->ltab[i].free == c->leb_size) {
+			c->ltab[i].cmt = 1;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	}
+
+	/* ... then wrap around and search the LEBs up to and including *lnum */
+	for (i = 0; i < n; i++) {
+		if (c->ltab[i].tgc || c->ltab[i].cmt)
+			continue;
+		if (c->ltab[i].free == c->leb_size) {
+			c->ltab[i].cmt = 1;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	}
+	return -ENOSPC;
+}
+
+/**
+ * layout_cnodes - layout cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * Walks the list of cnodes headed by @c->lpt_cnext and assigns each one a
+ * LEB number and offset, starting at the current head position
+ * (@c->nhead_lnum:@c->nhead_offs). The position is stored in the parent's
+ * branch, or in @c->lpt_lnum/@c->lpt_offs for a cnode without a parent.
+ * Positions are also chosen for the ltab and, when @c->big_lpt is set, for
+ * the lsave table. Nothing is written to the media here.
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int layout_cnodes(struct ubifs_info *c)
+{
+	int lnum, offs, len, alen, done_lsave, done_ltab, err;
+	struct ubifs_cnode *cnode;
+
+	err = dbg_chk_lpt_sz(c, 0, 0);
+	if (err)
+		return err;
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	lnum = c->nhead_lnum;
+	offs = c->nhead_offs;
+	/* Try to place lsave and ltab nicely */
+	/* lsave is only laid out when big_lpt is set */
+	done_lsave = !c->big_lpt;
+	done_ltab = 0;
+	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+		done_lsave = 1;
+		c->lsave_lnum = lnum;
+		c->lsave_offs = offs;
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	if (offs + c->ltab_sz <= c->leb_size) {
+		done_ltab = 1;
+		c->ltab_lnum = lnum;
+		c->ltab_offs = offs;
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	do {
+		/* Non-zero level means nnode, level 0 means pnode */
+		if (cnode->level) {
+			len = c->nnode_sz;
+			c->dirty_nn_cnt -= 1;
+		} else {
+			len = c->pnode_sz;
+			c->dirty_pn_cnt -= 1;
+		}
+		/* The cnode does not fit: close this LEB and take another */
+		while (offs + len > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			/* Alignment padding is accounted as dirty space */
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			/* Try to place lsave and ltab nicely */
+			if (!done_lsave) {
+				done_lsave = 1;
+				c->lsave_lnum = lnum;
+				c->lsave_offs = offs;
+				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+				/* Re-check whether the cnode still fits */
+				continue;
+			}
+			if (!done_ltab) {
+				done_ltab = 1;
+				c->ltab_lnum = lnum;
+				c->ltab_offs = offs;
+				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+				/* Re-check whether the cnode still fits */
+				continue;
+			}
+			break;
+		}
+		/* Record where this cnode is going to be written */
+		if (cnode->parent) {
+			cnode->parent->nbranch[cnode->iip].lnum = lnum;
+			cnode->parent->nbranch[cnode->iip].offs = offs;
+		} else {
+			c->lpt_lnum = lnum;
+			c->lpt_offs = offs;
+		}
+		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
+		cnode = cnode->cnext;
+	} while (cnode && cnode != c->lpt_cnext);
+
+	/* Make sure to place LPT's save table */
+	if (!done_lsave) {
+		if (offs + c->lsave_sz > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+		}
+		done_lsave = 1;
+		c->lsave_lnum = lnum;
+		c->lsave_offs = offs;
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	/* Make sure to place LPT's own lprops table */
+	if (!done_ltab) {
+		if (offs + c->ltab_sz > c->leb_size) {
+			alen = ALIGN(offs, c->min_io_size);
+			upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = alloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+		}
+		done_ltab = 1;
+		c->ltab_lnum = lnum;
+		c->ltab_offs = offs;
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	/* Account for the final, partly filled LEB */
+	alen = ALIGN(offs, c->min_io_size);
+	upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
+	dbg_chk_lpt_sz(c, 4, alen - offs);
+	err = dbg_chk_lpt_sz(c, 3, alen);
+	if (err)
+		return err;
+	return 0;
+
+no_space:
+	ubifs_err("LPT out of space");
+	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
+		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	dbg_dump_lpt_lebs(c);
+	dump_stack();
+	return err;
+}
+
+/**
+ * realloc_lpt_leb - find the next LPT LEB chosen by alloc_lpt_leb.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number is passed and returned here
+ *
+ * This function duplicates exactly the results of the function alloc_lpt_leb.
+ * It is used during end commit to reallocate the same LEB numbers that were
+ * allocated by alloc_lpt_leb during start commit.
+ *
+ * This function finds the next LEB that was allocated by the alloc_lpt_leb
+ * function starting from @lnum. If a LEB is found it is returned in @lnum and
+ * the function returns %0. Otherwise the function returns -ENOSPC.
+ * Note however, that LPT is designed never to run out of space.
+ */
+static int realloc_lpt_leb(struct ubifs_info *c, int *lnum)
+{
+	int i, n;
+
+	/* Same search order as alloc_lpt_leb: after *lnum, then wrap around */
+	n = *lnum - c->lpt_first + 1;
+	for (i = n; i < c->lpt_lebs; i++)
+		if (c->ltab[i].cmt) {
+			/* Clear the mark so the LEB is handed out only once */
+			c->ltab[i].cmt = 0;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+
+	for (i = 0; i < n; i++)
+		if (c->ltab[i].cmt) {
+			c->ltab[i].cmt = 0;
+			*lnum = i + c->lpt_first;
+			return 0;
+		}
+	return -ENOSPC;
+}
+
+/**
+ * write_cnodes - write cnodes for commit.
+ * @c: UBIFS file-system description object
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int write_cnodes(struct ubifs_info *c)
+{
+	int lnum, offs, len, from, err, wlen, alen, done_ltab, done_lsave;
+	struct ubifs_cnode *cnode;
+	void *buf = c->lpt_buf;
+
+	cnode = c->lpt_cnext;
+	if (!cnode)
+		return 0;
+	lnum = c->nhead_lnum;
+	offs = c->nhead_offs;
+	/* 'from' is the offset in 'buf' (and in the LEB) of unwritten data */
+	from = offs;
+	/* Ensure empty LEB is unmapped */
+	if (offs == 0) {
+		err = ubifs_leb_unmap(c, lnum);
+		if (err)
+			return err;
+	}
+	/* Try to place lsave and ltab nicely */
+	/* lsave is only written when big_lpt is set */
+	done_lsave = !c->big_lpt;
+	done_ltab = 0;
+	if (!done_lsave && offs + c->lsave_sz <= c->leb_size) {
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	if (offs + c->ltab_sz <= c->leb_size) {
+		done_ltab = 1;
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	/* Loop for each cnode */
+	do {
+		/* Non-zero level means nnode, level 0 means pnode */
+		if (cnode->level)
+			len = c->nnode_sz;
+		else
+			len = c->pnode_sz;
+		/* The cnode does not fit: flush the buffer, go to next LEB */
+		while (offs + len > c->leb_size) {
+			wlen = offs - from;
+			if (wlen) {
+				alen = ALIGN(wlen, c->min_io_size);
+				/* Pad up to the min. I/O unit with 0xff */
+				memset(buf + offs, 0xff, alen - wlen);
+				err = ubifs_leb_write(c, lnum, buf + from, from,
+						       alen, UBI_SHORTTERM);
+				if (err)
+					return err;
+			}
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+			/* Try to place lsave and ltab nicely */
+			if (!done_lsave) {
+				done_lsave = 1;
+				ubifs_pack_lsave(c, buf + offs, c->lsave);
+				offs += c->lsave_sz;
+				dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+				/* Re-check whether the cnode still fits */
+				continue;
+			}
+			if (!done_ltab) {
+				done_ltab = 1;
+				ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+				offs += c->ltab_sz;
+				dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+				/* Re-check whether the cnode still fits */
+				continue;
+			}
+			break;
+		}
+		if (cnode->level)
+			ubifs_pack_nnode(c, buf + offs,
+					 (struct ubifs_nnode *)cnode);
+		else
+			ubifs_pack_pnode(c, buf + offs,
+					 (struct ubifs_pnode *)cnode);
+		/*
+		 * The reason for the barriers is the same as in case of TNC.
+		 * See comment in 'write_index()'. 'dirty_cow_nnode()' and
+		 * 'dirty_cow_pnode()' are the functions for which this is
+		 * important.
+		 */
+		clear_bit(DIRTY_CNODE, &cnode->flags);
+		smp_mb__before_clear_bit();
+		clear_bit(COW_ZNODE, &cnode->flags);
+		smp_mb__after_clear_bit();
+		offs += len;
+		dbg_chk_lpt_sz(c, 1, len);
+		cnode = cnode->cnext;
+	} while (cnode && cnode != c->lpt_cnext);
+
+	/* Make sure to place LPT's save table */
+	if (!done_lsave) {
+		if (offs + c->lsave_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+					      UBI_SHORTTERM);
+			if (err)
+				return err;
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		done_lsave = 1;
+		ubifs_pack_lsave(c, buf + offs, c->lsave);
+		offs += c->lsave_sz;
+		dbg_chk_lpt_sz(c, 1, c->lsave_sz);
+	}
+
+	/* Make sure to place LPT's own lprops table */
+	if (!done_ltab) {
+		if (offs + c->ltab_sz > c->leb_size) {
+			wlen = offs - from;
+			alen = ALIGN(wlen, c->min_io_size);
+			memset(buf + offs, 0xff, alen - wlen);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
+					      UBI_SHORTTERM);
+			if (err)
+				return err;
+			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
+			err = realloc_lpt_leb(c, &lnum);
+			if (err)
+				goto no_space;
+			offs = from = 0;
+			ubifs_assert(lnum >= c->lpt_first &&
+				     lnum <= c->lpt_last);
+			err = ubifs_leb_unmap(c, lnum);
+			if (err)
+				return err;
+		}
+		done_ltab = 1;
+		ubifs_pack_ltab(c, buf + offs, c->ltab_cmt);
+		offs += c->ltab_sz;
+		dbg_chk_lpt_sz(c, 1, c->ltab_sz);
+	}
+
+	/* Write remaining data in buffer */
+	wlen = offs - from;
+	alen = ALIGN(wlen, c->min_io_size);
+	memset(buf + offs, 0xff, alen - wlen);
+	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
+	if (err)
+		return err;
+
+	dbg_chk_lpt_sz(c, 4, alen - wlen);
+	err = dbg_chk_lpt_sz(c, 3, ALIGN(offs, c->min_io_size));
+	if (err)
+		return err;
+
+	/* The new head is the aligned end of what has just been written */
+	c->nhead_lnum = lnum;
+	c->nhead_offs = ALIGN(offs, c->min_io_size);
+
+	dbg_lp("LPT root is at %d:%d", c->lpt_lnum, c->lpt_offs);
+	dbg_lp("LPT head is at %d:%d", c->nhead_lnum, c->nhead_offs);
+	dbg_lp("LPT ltab is at %d:%d", c->ltab_lnum, c->ltab_offs);
+	if (c->big_lpt)
+		dbg_lp("LPT lsave is at %d:%d", c->lsave_lnum, c->lsave_offs);
+
+	return 0;
+
+no_space:
+	ubifs_err("LPT out of space mismatch");
+	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
+		"%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+	dbg_dump_lpt_info(c);
+	dbg_dump_lpt_lebs(c);
+	dump_stack();
+	return err;
+}
+
+/**
+ * next_pnode_to_dirty - find next pnode to dirty.
+ * @c: UBIFS file-system description object
+ * @pnode: pnode
+ *
+ * This function returns the next pnode to dirty or %NULL if there are no more
+ * pnodes. An error pointer may be returned instead, so the result has to be
+ * checked with IS_ERR(). Note that pnodes that have never been written
+ * (lnum == 0) are skipped.
+ */
+static struct ubifs_pnode *next_pnode_to_dirty(struct ubifs_info *c,
+					       struct ubifs_pnode *pnode)
+{
+	struct ubifs_nnode *nnode;
+	int iip;
+
+	/* Try to go right */
+	nnode = pnode->parent;
+	for (iip = pnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) {
+		/* A zero lnum means the branch has never been written */
+		if (nnode->nbranch[iip].lnum)
+			return ubifs_get_pnode(c, nnode, iip);
+	}
+
+	/* Go up while can't go right */
+	do {
+		iip = nnode->iip + 1;
+		nnode = nnode->parent;
+		/* Climbed past the top of the tree: no more pnodes */
+		if (!nnode)
+			return NULL;
+		for (; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+	} while (iip >= UBIFS_LPT_FANOUT);
+
+	/* Go right */
+	nnode = ubifs_get_nnode(c, nnode, iip);
+	if (IS_ERR(nnode))
+		/* Pass the error pointer on with the pnode pointer type */
+		return (void *)nnode;
+
+	/* Go down to level 1 */
+	while (nnode->level > 1) {
+		for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++) {
+			if (nnode->nbranch[iip].lnum)
+				break;
+		}
+		if (iip >= UBIFS_LPT_FANOUT) {
+			/*
+			 * Should not happen, but we need to keep going
+			 * if it does.
+			 */
+			iip = 0;
+		}
+		nnode = ubifs_get_nnode(c, nnode, iip);
+		if (IS_ERR(nnode))
+			return (void *)nnode;
+	}
+
+	/* Pick the first written pnode below the level 1 nnode */
+	for (iip = 0; iip < UBIFS_LPT_FANOUT; iip++)
+		if (nnode->nbranch[iip].lnum)
+			break;
+	if (iip >= UBIFS_LPT_FANOUT)
+		/* Should not happen, but we need to keep going if it does */
+		iip = 0;
+	return ubifs_get_pnode(c, nnode, iip);
+}
+
+/**
+ * pnode_lookup - lookup a pnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: pnode number (0 to main_lebs - 1)
+ *
+ * This function returns a pointer to the pnode on success or a negative
+ * error code on failure.
+ */ +static struct ubifs_pnode *pnode_lookup(struct ubifs_info *c, int i) +{ + int err, h, iip, shft; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + i <<= UBIFS_LPT_FANOUT_SHIFT; + nnode = c->nroot; + shft = c->lpt_hght * UBIFS_LPT_FANOUT_SHIFT; + for (h = 1; h < c->lpt_hght; h++) { + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + shft -= UBIFS_LPT_FANOUT_SHIFT; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return ERR_PTR(PTR_ERR(nnode)); + } + iip = ((i >> shft) & (UBIFS_LPT_FANOUT - 1)); + return ubifs_get_pnode(c, nnode, iip); +} + +/** + * add_pnode_dirt - add dirty space to LPT LEB properties. + * @c: UBIFS file-system description object + * @pnode: pnode for which to add dirt + */ +static void add_pnode_dirt(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + ubifs_add_lpt_dirt(c, pnode->parent->nbranch[pnode->iip].lnum, + c->pnode_sz); +} + +/** + * do_make_pnode_dirty - mark a pnode dirty. + * @c: UBIFS file-system description object + * @pnode: pnode to mark dirty + */ +static void do_make_pnode_dirty(struct ubifs_info *c, struct ubifs_pnode *pnode) +{ + /* Assumes cnext list is empty i.e. not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &pnode->flags)) { + struct ubifs_nnode *nnode; + + c->dirty_pn_cnt += 1; + add_pnode_dirt(c, pnode); + /* Mark parent and ancestors dirty too */ + nnode = pnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } +} + +/** + * make_tree_dirty - mark the entire LEB properties tree dirty. + * @c: UBIFS file-system description object + * + * This function is used by the "small" LPT model to cause the entire LEB + * properties tree to be written. 
The "small" LPT model does not use LPT + * garbage collection because it is more efficient to write the entire tree + * (because it is small). + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_tree_dirty(struct ubifs_info *c) +{ + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, 0); + while (pnode) { + do_make_pnode_dirty(c, pnode); + pnode = next_pnode_to_dirty(c, pnode); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + } + return 0; +} + +/** + * need_write_all - determine if the LPT area is running out of free space. + * @c: UBIFS file-system description object + * + * This function returns %1 if the LPT area is running out of free space and %0 + * if it is not. + */ +static int need_write_all(struct ubifs_info *c) +{ + long long free = 0; + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; + else if (c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + free += c->leb_size; + } + /* Less than twice the size left */ + if (free <= c->lpt_sz * 2) + return 1; + return 0; +} + +/** + * lpt_tgc_start - start trivial garbage collection of LPT LEBs. + * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called during start commit to mark LPT LEBs for trivial GC. + */ +static void lpt_tgc_start(struct ubifs_info *c) +{ + int i; + + for (i = 0; i < c->lpt_lebs; i++) { + if (i + c->lpt_first == c->nhead_lnum) + continue; + if (c->ltab[i].dirty > 0 && + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) { + c->ltab[i].tgc = 1; + c->ltab[i].free = c->leb_size; + c->ltab[i].dirty = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + } +} + +/** + * lpt_tgc_end - end trivial garbage collection of LPT LEBs. 
+ * @c: UBIFS file-system description object + * + * LPT trivial garbage collection is where a LPT LEB contains only dirty and + * free space and so may be reused as soon as the next commit is completed. + * This function is called after the commit is completed (master node has been + * written) and un-maps LPT LEBs that were marked for trivial GC. + */ +static int lpt_tgc_end(struct ubifs_info *c) +{ + int i, err; + + for (i = 0; i < c->lpt_lebs; i++) + if (c->ltab[i].tgc) { + err = ubifs_leb_unmap(c, i + c->lpt_first); + if (err) + return err; + c->ltab[i].tgc = 0; + dbg_lp("LEB %d", i + c->lpt_first); + } + return 0; +} + +/** + * populate_lsave - fill the lsave array with important LEB numbers. + * @c: the UBIFS file-system description object + * + * This function is only called for the "big" model. It records a small number + * of LEB numbers of important LEBs. Important LEBs are ones that are (from + * most important to least important): empty, freeable, freeable index, dirty + * index, dirty or free. Upon mount, we read this list of LEB numbers and bring + * their pnodes into memory. That will stop us from having to scan the LPT + * straight away. For the "small" model we assume that scanning the LPT is no + * big deal. 
+ */
+static void populate_lsave(struct ubifs_info *c)
+{
+	struct list_head *lists[3];
+	int cats[3];
+	int n, k, cnt = 0;
+
+	ubifs_assert(c->big_lpt);
+	/* The old lsave node becomes dirt the first time it is superseded */
+	if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) {
+		c->lpt_drty_flgs |= LSAVE_DIRTY;
+		ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz);
+	}
+
+	/* Sources in order of importance: the lists first, then the heaps */
+	lists[0] = &c->empty_list;
+	lists[1] = &c->freeable_list;
+	lists[2] = &c->frdi_idx_list;
+	cats[0] = LPROPS_DIRTY_IDX;
+	cats[1] = LPROPS_DIRTY;
+	cats[2] = LPROPS_FREE;
+
+	for (n = 0; n < 3; n++) {
+		struct ubifs_lprops *lp;
+
+		list_for_each_entry(lp, lists[n], list) {
+			c->lsave[cnt++] = lp->lnum;
+			if (cnt >= c->lsave_cnt)
+				return;
+		}
+	}
+	for (n = 0; n < 3; n++) {
+		struct ubifs_lpt_heap *heap = &c->lpt_heap[cats[n] - 1];
+
+		for (k = 0; k < heap->cnt; k++) {
+			c->lsave[cnt++] = heap->arr[k]->lnum;
+			if (cnt >= c->lsave_cnt)
+				return;
+		}
+	}
+	/* Fill it up completely */
+	while (cnt < c->lsave_cnt)
+		c->lsave[cnt++] = c->main_first;
+}
+
+/**
+ * nnode_lookup - lookup a nnode in the LPT.
+ * @c: UBIFS file-system description object
+ * @i: nnode number
+ *
+ * This function returns a pointer to the nnode on success or a negative
+ * error code on failure.
+ */ +static struct ubifs_nnode *nnode_lookup(struct ubifs_info *c, int i) +{ + int err, iip; + struct ubifs_nnode *nnode; + + if (!c->nroot) { + err = ubifs_read_nnode(c, NULL, 0); + if (err) + return ERR_PTR(err); + } + nnode = c->nroot; + while (1) { + iip = i & (UBIFS_LPT_FANOUT - 1); + i >>= UBIFS_LPT_FANOUT_SHIFT; + if (!i) + break; + nnode = ubifs_get_nnode(c, nnode, iip); + if (IS_ERR(nnode)) + return nnode; + } + return nnode; +} + +/** + * make_nnode_dirty - find a nnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: nnode number of nnode to make dirty + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_nnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_nnode *nnode; + + nnode = nnode_lookup(c, node_num); + if (IS_ERR(nnode)) + return PTR_ERR(nnode); + if (nnode->parent) { + struct ubifs_nbranch *branch; + + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; /* nnode is obsolete */ + } else if (c->lpt_lnum != lnum || c->lpt_offs != offs) + return 0; /* nnode is obsolete */ + /* Assumes cnext list is empty i.e. 
not called during commit */ + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + /* Mark parent and ancestors dirty too */ + nnode = nnode->parent; + while (nnode) { + if (!test_and_set_bit(DIRTY_CNODE, &nnode->flags)) { + c->dirty_nn_cnt += 1; + ubifs_add_nnode_dirt(c, nnode); + nnode = nnode->parent; + } else + break; + } + } + return 0; +} + +/** + * make_pnode_dirty - find a pnode and, if found, make it dirty. + * @c: UBIFS file-system description object + * @node_num: pnode number of pnode to make dirty + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_pnode_dirty(struct ubifs_info *c, int node_num, int lnum, + int offs) +{ + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + pnode = pnode_lookup(c, node_num); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + return 0; + do_make_pnode_dirty(c, pnode); + return 0; +} + +/** + * make_ltab_dirty - make ltab node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where ltab was written + * @offs: offset where ltab was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. 
The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 0; /* This ltab node is obsolete */ + if (!(c->lpt_drty_flgs & LTAB_DIRTY)) { + c->lpt_drty_flgs |= LTAB_DIRTY; + ubifs_add_lpt_dirt(c, c->ltab_lnum, c->ltab_sz); + } + return 0; +} + +/** + * make_lsave_dirty - make lsave node dirty. + * @c: UBIFS file-system description object + * @lnum: LEB number where lsave was written + * @offs: offset where lsave was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 0; /* This lsave node is obsolete */ + if (!(c->lpt_drty_flgs & LSAVE_DIRTY)) { + c->lpt_drty_flgs |= LSAVE_DIRTY; + ubifs_add_lpt_dirt(c, c->lsave_lnum, c->lsave_sz); + } + return 0; +} + +/** + * make_node_dirty - make node dirty. + * @c: UBIFS file-system description object + * @node_type: LPT node type + * @node_num: node number + * @lnum: LEB number where node was written + * @offs: offset where node was written + * + * This function is used by LPT garbage collection. LPT garbage collection is + * used only for the "big" LPT model (c->big_lpt == 1). Garbage collection + * simply involves marking all the nodes in the LEB being garbage-collected as + * dirty. The dirty nodes are written next commit, after which the LEB is free + * to be reused. 
+ * + * This function returns %0 on success and a negative error code on failure. + */ +static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num, + int lnum, int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return make_nnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_PNODE: + return make_pnode_dirty(c, node_num, lnum, offs); + case UBIFS_LPT_LTAB: + return make_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return make_lsave_dirty(c, lnum, offs); + } + return -EINVAL; +} + +/** + * get_lpt_node_len - return the length of a node based on its type. + * @c: UBIFS file-system description object + * @node_type: LPT node type + */ +static int get_lpt_node_len(const struct ubifs_info *c, int node_type) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return c->nnode_sz; + case UBIFS_LPT_PNODE: + return c->pnode_sz; + case UBIFS_LPT_LTAB: + return c->ltab_sz; + case UBIFS_LPT_LSAVE: + return c->lsave_sz; + } + return 0; +} + +/** + * get_pad_len - return the length of padding in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + */ +static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len) +{ + int offs, pad_len; + + if (c->min_io_size == 1) + return 0; + offs = c->leb_size - len; + pad_len = ALIGN(offs, c->min_io_size) - offs; + return pad_len; +} + +/** + * get_lpt_node_type - return type (and node number) of a node in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer + * @node_num: node number is returned here + */ +static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf, + int *node_num) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type; + + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + *node_num = ubifs_unpack_bits(&addr, &pos, c->pcnt_bits); + return node_type; +} + +/** + * is_a_node - determine if a buffer contains a node. 
+ * @c: UBIFS file-system description object + * @buf: buffer + * @len: length of buffer + * + * This function returns %1 if the buffer contains a node or %0 if it does not. + */ +static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len) +{ + uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; + int pos = 0, node_type, node_len; + uint16_t crc, calc_crc; + + if (len < UBIFS_LPT_CRC_BYTES + (UBIFS_LPT_TYPE_BITS + 7) / 8) + return 0; + node_type = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_TYPE_BITS); + if (node_type == UBIFS_LPT_NOT_A_NODE) + return 0; + node_len = get_lpt_node_len(c, node_type); + if (!node_len || node_len > len) + return 0; + pos = 0; + addr = buf; + crc = ubifs_unpack_bits(&addr, &pos, UBIFS_LPT_CRC_BITS); + calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, + node_len - UBIFS_LPT_CRC_BYTES); + if (crc != calc_crc) + return 0; + return 1; +} + +/** + * lpt_gc_lnum - garbage collect a LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to garbage collect + * + * LPT garbage collection is used only for the "big" LPT model + * (c->big_lpt == 1). Garbage collection simply involves marking all the nodes + * in the LEB being garbage-collected as dirty. The dirty nodes are written + * next commit, after which the LEB is free to be reused. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int lpt_gc_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->lpt_buf; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + continue; + } + return 0; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + offs = c->leb_size - len; + ubifs_assert(node_len != 0); + mutex_lock(&c->lp_mutex); + err = make_node_dirty(c, node_type, node_num, lnum, offs); + mutex_unlock(&c->lp_mutex); + if (err) + return err; + buf += node_len; + len -= node_len; + } + return 0; +} + +/** + * lpt_gc - LPT garbage collection. + * @c: UBIFS file-system description object + * + * Select a LPT LEB for LPT garbage collection and call 'lpt_gc_lnum()'. + * Returns %0 on success and a negative error code on failure. + */ +static int lpt_gc(struct ubifs_info *c) +{ + int i, lnum = -1, dirty = 0; + + mutex_lock(&c->lp_mutex); + for (i = 0; i < c->lpt_lebs; i++) { + ubifs_assert(!c->ltab[i].tgc); + if (i + c->lpt_first == c->nhead_lnum || + c->ltab[i].free + c->ltab[i].dirty == c->leb_size) + continue; + if (c->ltab[i].dirty > dirty) { + dirty = c->ltab[i].dirty; + lnum = i + c->lpt_first; + } + } + mutex_unlock(&c->lp_mutex); + if (lnum == -1) + return -ENOSPC; + return lpt_gc_lnum(c, lnum); +} + +/** + * ubifs_lpt_start_commit - UBIFS commit starts. + * @c: the UBIFS file-system description object + * + * This function has to be called when UBIFS starts the commit operation. + * This function "freezes" all currently dirty LEB properties and does not + * change them anymore. Further changes are saved and tracked separately + * because they are not part of this commit. 
+ * This function returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int ubifs_lpt_start_commit(struct ubifs_info *c)
+{
+	int err, cnt;
+
+	dbg_lp("");
+
+	mutex_lock(&c->lp_mutex);
+	err = dbg_chk_lpt_free_spc(c);
+	if (err)
+		goto out;
+	err = dbg_check_ltab(c);
+	if (err)
+		goto out;
+
+	if (c->check_lpt_free) {
+		/*
+		 * We ensure there is enough free space in
+		 * ubifs_lpt_post_commit() by marking nodes dirty. That
+		 * information is lost when we unmount, so we need to check
+		 * free space once after mounting as well.
+		 */
+		c->check_lpt_free = 0;
+		while (need_write_all(c)) {
+			/* 'lpt_gc()' takes 'lp_mutex' itself */
+			mutex_unlock(&c->lp_mutex);
+			err = lpt_gc(c);
+			if (err)
+				/* The mutex is not held here, so no 'goto out' */
+				return err;
+			mutex_lock(&c->lp_mutex);
+		}
+	}
+
+	lpt_tgc_start(c);
+
+	if (!c->dirty_pn_cnt) {
+		dbg_cmt("no cnodes to commit");
+		err = 0;
+		goto out;
+	}
+
+	if (!c->big_lpt && need_write_all(c)) {
+		/* If needed, write everything */
+		err = make_tree_dirty(c);
+		if (err)
+			goto out;
+		/* More LEBs may now hold only free and dirty space */
+		lpt_tgc_start(c);
+	}
+
+	if (c->big_lpt)
+		populate_lsave(c);
+
+	cnt = get_cnodes_to_commit(c);
+	ubifs_assert(cnt != 0);
+
+	err = layout_cnodes(c);
+	if (err)
+		goto out;
+
+	/* Copy the LPT's own lprops for end commit to write */
+	memcpy(c->ltab_cmt, c->ltab,
+	       sizeof(struct ubifs_lpt_lprops) * c->lpt_lebs);
+	c->lpt_drty_flgs &= ~(LTAB_DIRTY | LSAVE_DIRTY);
+
+out:
+	mutex_unlock(&c->lp_mutex);
+	return err;
+}
+
+/**
+ * free_obsolete_cnodes - free obsolete cnodes for commit end.
+ * @c: UBIFS file-system description object
+ *
+ * Walks the list headed by @c->lpt_cnext once. Cnodes flagged
+ * %OBSOLETE_CNODE are freed, the others only have their 'cnext' link cleared.
+ * @c->lpt_cnext is reset to %NULL afterwards.
+ */
+static void free_obsolete_cnodes(struct ubifs_info *c)
+{
+	struct ubifs_cnode *cnode, *cnext;
+
+	cnext = c->lpt_cnext;
+	if (!cnext)
+		return;
+	do {
+		cnode = cnext;
+		/* Fetch the link first, 'cnode' may be freed below */
+		cnext = cnode->cnext;
+		if (test_bit(OBSOLETE_CNODE, &cnode->flags))
+			kfree(cnode);
+		else
+			cnode->cnext = NULL;
+	} while (cnext != c->lpt_cnext);
+	c->lpt_cnext = NULL;
+}
+
+/**
+ * ubifs_lpt_end_commit - finish the commit operation.
+ * @c: the UBIFS file-system description object + * + * This function has to be called when the commit operation finishes. It + * flushes the changes which were "frozen" by 'ubifs_lprops_start_commit()' to + * the media. Returns zero in case of success and a negative error code in case + * of failure. + */ +int ubifs_lpt_end_commit(struct ubifs_info *c) +{ + int err; + + dbg_lp(""); + + if (!c->lpt_cnext) + return 0; + + err = write_cnodes(c); + if (err) + return err; + + mutex_lock(&c->lp_mutex); + free_obsolete_cnodes(c); + mutex_unlock(&c->lp_mutex); + + return 0; +} + +/** + * ubifs_lpt_post_commit - post commit LPT trivial GC and LPT GC. + * @c: UBIFS file-system description object + * + * LPT trivial GC is completed after a commit. Also LPT GC is done after a + * commit for the "big" LPT model. + */ +int ubifs_lpt_post_commit(struct ubifs_info *c) +{ + int err; + + mutex_lock(&c->lp_mutex); + err = lpt_tgc_end(c); + if (err) + goto out; + if (c->big_lpt) + while (need_write_all(c)) { + mutex_unlock(&c->lp_mutex); + err = lpt_gc(c); + if (err) + return err; + mutex_lock(&c->lp_mutex); + } +out: + mutex_unlock(&c->lp_mutex); + return err; +} + +/** + * first_nnode - find the first nnode in memory. + * @c: UBIFS file-system description object + * @hght: height of tree where nnode found is returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *first_nnode(struct ubifs_info *c, int *hght) +{ + struct ubifs_nnode *nnode; + int h, i, found; + + nnode = c->nroot; + *hght = 0; + if (!nnode) + return NULL; + for (h = 1; h < c->lpt_hght; h++) { + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * next_nnode - find the next nnode in memory. 
+ * @c: UBIFS file-system description object + * @nnode: nnode from which to start. + * @hght: height of tree where nnode is, is passed and returned here + * + * This function returns a pointer to the nnode found or %NULL if no nnode is + * found. This function is a helper to 'ubifs_lpt_free()'. + */ +static struct ubifs_nnode *next_nnode(struct ubifs_info *c, + struct ubifs_nnode *nnode, int *hght) +{ + struct ubifs_nnode *parent; + int iip, h, i, found; + + parent = nnode->parent; + if (!parent) + return NULL; /* no parent: nothing follows this nnode */ + if (nnode->iip == UBIFS_LPT_FANOUT - 1) { /* last branch: the parent comes next */ + *hght -= 1; + return parent; + } + for (iip = nnode->iip + 1; iip < UBIFS_LPT_FANOUT; iip++) { + nnode = parent->nbranch[iip].nnode; + if (nnode) + break; + } + if (!nnode) { /* no later sibling in memory: go up to the parent */ + *hght -= 1; + return parent; + } + for (h = *hght + 1; h < c->lpt_hght; h++) { /* descend from the sibling found */ + found = 0; + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + if (nnode->nbranch[i].nnode) { + found = 1; + nnode = nnode->nbranch[i].nnode; + *hght = h; + break; + } + } + if (!found) + break; + } + return nnode; +} + +/** + * ubifs_lpt_free - free resources owned by the LPT.
+ * @c: UBIFS file-system description object + * @wr_only: free only resources used for writing + */ +void ubifs_lpt_free(struct ubifs_info *c, int wr_only) +{ + struct ubifs_nnode *nnode; + int i, hght; + + /* Free write-only things first */ + + free_obsolete_cnodes(c); /* Leftover from a failed commit */ + + vfree(c->ltab_cmt); + c->ltab_cmt = NULL; + vfree(c->lpt_buf); + c->lpt_buf = NULL; + kfree(c->lsave); + c->lsave = NULL; + + if (wr_only) + return; + + /* Now free the rest */ + + nnode = first_nnode(c, &hght); + while (nnode) { + for (i = 0; i < UBIFS_LPT_FANOUT; i++) + kfree(nnode->nbranch[i].nnode); /* free every child of this nnode */ + nnode = next_nnode(c, nnode, &hght); + } + for (i = 0; i < LPROPS_HEAP_CNT; i++) + kfree(c->lpt_heap[i].arr); + kfree(c->dirty_idx.arr); + kfree(c->nroot); /* the root is nobody's child, so it is freed here */ + vfree(c->ltab); + kfree(c->lpt_nod_buf); +} + +#ifdef CONFIG_UBIFS_FS_DEBUG + +/** + * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes. + * @buf: buffer + * @len: buffer length + */ +static int dbg_is_all_ff(uint8_t *buf, int len) +{ + int i; + + for (i = 0; i < len; i++) + if (buf[i] != 0xff) + return 0; + return 1; /* every byte was 0xff (also when len <= 0) */ +} + +/** + * dbg_is_nnode_dirty - determine if a nnode is dirty.
+ * @c: the UBIFS file-system description object + * @lnum: LEB number where nnode was written + * @offs: offset where nnode was written + */ +static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_nnode *nnode; + int hght; + + /* Entire tree is in memory so first_nnode / next_nnode are OK */ + nnode = first_nnode(c, &hght); + for (; nnode; nnode = next_nnode(c, nnode, &hght)) { + struct ubifs_nbranch *branch; + + cond_resched(); + if (nnode->parent) { + branch = &nnode->parent->nbranch[nnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } else { /* no parent: the position is kept in c->lpt_lnum / c->lpt_offs */ + if (c->lpt_lnum != lnum || c->lpt_offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &nnode->flags)) + return 1; + return 0; + } + } + return 1; /* no nnode found at lnum:offs: report dirty */ +} + +/** + * dbg_is_pnode_dirty - determine if a pnode is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where pnode was written + * @offs: offset where pnode was written + */ +static int dbg_is_pnode_dirty(struct ubifs_info *c, int lnum, int offs) +{ + int i, cnt; + + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + struct ubifs_nbranch *branch; + + cond_resched(); + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); /* negative error code rather than 0 or 1 */ + branch = &pnode->parent->nbranch[pnode->iip]; + if (branch->lnum != lnum || branch->offs != offs) + continue; + if (test_bit(DIRTY_CNODE, &pnode->flags)) + return 1; + return 0; + } + return 1; /* no pnode found at lnum:offs: report dirty */ +} + +/** + * dbg_is_ltab_dirty - determine if a ltab node is dirty.
+ * @c: the UBIFS file-system description object + * @lnum: LEB number where ltab node was written + * @offs: offset where ltab node was written + */ +static int dbg_is_ltab_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->ltab_lnum || offs != c->ltab_offs) + return 1; /* not the position recorded in c->ltab_lnum / c->ltab_offs */ + return (c->lpt_drty_flgs & LTAB_DIRTY) != 0; +} + +/** + * dbg_is_lsave_dirty - determine if a lsave node is dirty. + * @c: the UBIFS file-system description object + * @lnum: LEB number where lsave node was written + * @offs: offset where lsave node was written + */ +static int dbg_is_lsave_dirty(struct ubifs_info *c, int lnum, int offs) +{ + if (lnum != c->lsave_lnum || offs != c->lsave_offs) + return 1; /* not the position recorded in c->lsave_lnum / c->lsave_offs */ + return (c->lpt_drty_flgs & LSAVE_DIRTY) != 0; +} + +/** + * dbg_is_node_dirty - determine if a node is dirty. + * @c: the UBIFS file-system description object + * @node_type: node type + * @lnum: LEB number where node was written + * @offs: offset where node was written + */ +static int dbg_is_node_dirty(struct ubifs_info *c, int node_type, int lnum, + int offs) +{ + switch (node_type) { + case UBIFS_LPT_NNODE: + return dbg_is_nnode_dirty(c, lnum, offs); + case UBIFS_LPT_PNODE: + return dbg_is_pnode_dirty(c, lnum, offs); + case UBIFS_LPT_LTAB: + return dbg_is_ltab_dirty(c, lnum, offs); + case UBIFS_LPT_LSAVE: + return dbg_is_lsave_dirty(c, lnum, offs); + } + return 1; /* any other node type is reported as dirty */ +} + +/** + * dbg_check_ltab_lnum - check the ltab for a LPT LEB number. + * @c: the UBIFS file-system description object + * @lnum: LPT LEB number to check + * + * Scans the LEB and compares its free and dirty space with the ltab entry. + * This function returns %0 on success and a negative error code on failure.
+ */ +static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len; + int ret; + void *buf = c->dbg->buf; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + dbg_lp("LEB %d", lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + dbg_msg("ubi_read failed, LEB %d, error %d", lnum, err); + return err; + } + while (1) { + if (!is_a_node(c, buf, len)) { + int i, pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + buf += pad_len; + len -= pad_len; + dirty += pad_len; /* padding counts as dirty space */ + continue; + } + if (!dbg_is_all_ff(buf, len)) { + dbg_msg("invalid empty space in LEB %d at %d", + lnum, c->leb_size - len); + err = -EINVAL; + } + i = lnum - c->lpt_first; /* index of this LEB in c->ltab */ + if (len != c->ltab[i].free) { + dbg_msg("invalid free space in LEB %d " + "(free %d, expected %d)", + lnum, len, c->ltab[i].free); + err = -EINVAL; + } + if (dirty != c->ltab[i].dirty) { + dbg_msg("invalid dirty space in LEB %d " + "(dirty %d, expected %d)", + lnum, dirty, c->ltab[i].dirty); + err = -EINVAL; + } + return err; + } + node_type = get_lpt_node_type(c, buf, &node_num); + node_len = get_lpt_node_len(c, node_type); + ret = dbg_is_node_dirty(c, node_type, lnum, c->leb_size - len); + if (ret == 1) /* NOTE(review): a negative ret (lookup error) is ignored here */ + dirty += node_len; + buf += node_len; + len -= node_len; + } +} + +/** + * dbg_check_ltab - check the free and dirty space in the ltab. + * @c: the UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure.
+ */ +int dbg_check_ltab(struct ubifs_info *c) +{ + int lnum, err, i, cnt; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + /* Bring the entire tree into memory */ + cnt = DIV_ROUND_UP(c->main_lebs, UBIFS_LPT_FANOUT); + for (i = 0; i < cnt; i++) { + struct ubifs_pnode *pnode; + + pnode = pnode_lookup(c, i); + if (IS_ERR(pnode)) + return PTR_ERR(pnode); + cond_resched(); + } + + /* Check nodes */ + err = dbg_check_lpt_nodes(c, (struct ubifs_cnode *)c->nroot, 0, 0); + if (err) + return err; + + /* Check each LEB */ + for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { + err = dbg_check_ltab_lnum(c, lnum); + if (err) { + dbg_err("failed at LEB %d", lnum); + return err; + } + } + + dbg_lp("succeeded"); + return 0; +} + +/** + * dbg_chk_lpt_free_spc - check LPT free space is enough to write entire LPT. + * @c: the UBIFS file-system description object + * + * Returns %0 if there is enough space (or checks are off), else %-EINVAL. + */ +int dbg_chk_lpt_free_spc(struct ubifs_info *c) +{ + long long free = 0; + int i; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + for (i = 0; i < c->lpt_lebs; i++) { + if (c->ltab[i].tgc || c->ltab[i].cmt) + continue; /* LEBs flagged tgc or cmt are not counted */ + if (i + c->lpt_first == c->nhead_lnum) + free += c->leb_size - c->nhead_offs; /* remainder of the nhead LEB */ + else if (c->ltab[i].free == c->leb_size) + free += c->leb_size; /* completely free LEB */ + } + if (free < c->lpt_sz) { + dbg_err("LPT space error: free %lld lpt_sz %lld", + free, c->lpt_sz); + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + return -EINVAL; + } + return 0; +} + +/** + * dbg_chk_lpt_sz - check LPT does not write more than LPT size. + * @c: the UBIFS file-system description object + * @action: what to do + * @len: length written + * + * This function returns %0 on success and a negative error code on failure.
+ * The @action argument may be one of: + * o %0 - LPT debugging checking starts, initialize debugging variables; + * o %1 - wrote an LPT node, increase LPT size by @len bytes; + * o %2 - switched to a different LEB and wasted @len bytes; + * o %3 - check that we've written the right number of bytes. + * o %4 - wasted @len bytes; + */ +int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) +{ + struct ubifs_debug_info *d = c->dbg; + long long chk_lpt_sz, lpt_sz; + int err = 0; + + if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS)) + return 0; + + switch (action) { + case 0: /* reset the counters and sanity-check the dirty node counts */ + d->chk_lpt_sz = 0; + d->chk_lpt_sz2 = 0; + d->chk_lpt_lebs = 0; + d->chk_lpt_wastage = 0; + if (c->dirty_pn_cnt > c->pnode_cnt) { + dbg_err("dirty pnodes %d exceed max %d", + c->dirty_pn_cnt, c->pnode_cnt); + err = -EINVAL; + } + if (c->dirty_nn_cnt > c->nnode_cnt) { + dbg_err("dirty nnodes %d exceed max %d", + c->dirty_nn_cnt, c->nnode_cnt); + err = -EINVAL; + } + return err; + case 1: /* a node of @len bytes was written */ + d->chk_lpt_sz += len; + return 0; + case 2: /* moved to another LEB: @len bytes wasted, one more LEB used */ + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + d->chk_lpt_lebs += 1; + return 0; + case 3: /* verify the accumulated totals; here @len is compared as an offset */ + chk_lpt_sz = c->leb_size; + chk_lpt_sz *= d->chk_lpt_lebs; + chk_lpt_sz += len - c->nhead_offs; + if (d->chk_lpt_sz != chk_lpt_sz) { + dbg_err("LPT wrote %lld but space used was %lld", + d->chk_lpt_sz, chk_lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz > c->lpt_sz) { + dbg_err("LPT wrote %lld but lpt_sz is %lld", + d->chk_lpt_sz, c->lpt_sz); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) { + dbg_err("LPT layout size %lld but wrote %lld", + d->chk_lpt_sz, d->chk_lpt_sz2); + err = -EINVAL; + } + if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { + dbg_err("LPT new nhead offs: expected %d was %d", + d->new_nhead_offs, len); + err = -EINVAL; + } + lpt_sz = (long long)c->pnode_cnt * c->pnode_sz; + lpt_sz += (long long)c->nnode_cnt * c->nnode_sz; + lpt_sz += c->ltab_sz; + if (c->big_lpt) + lpt_sz += c->lsave_sz; + if (d->chk_lpt_sz -
d->chk_lpt_wastage > lpt_sz) { + dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); + err = -EINVAL; + } + if (err) { + dbg_dump_lpt_info(c); + dbg_dump_lpt_lebs(c); + dump_stack(); + } + d->chk_lpt_sz2 = d->chk_lpt_sz; /* kept for comparison on the next action-3 call */ + d->chk_lpt_sz = 0; + d->chk_lpt_wastage = 0; + d->chk_lpt_lebs = 0; + d->new_nhead_offs = len; + return err; + case 4: /* @len bytes wasted without switching LEB */ + d->chk_lpt_sz += len; + d->chk_lpt_wastage += len; + return 0; + default: + return -EINVAL; + } +} + +/** + * dump_lpt_leb - dump an LPT LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to dump + * + * This function dumps an LEB from LPT area. Nodes in this area are very + * different to nodes in the main area (e.g., they do not have common headers, + * they do not have 8-byte alignments, etc), so we have a separate function to + * dump LPT area LEBs. Note, LPT has to be locked by the caller. + */ +static void dump_lpt_leb(const struct ubifs_info *c, int lnum) +{ + int err, len = c->leb_size, node_type, node_num, node_len, offs; + void *buf = c->dbg->buf; + + printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n", + current->pid, lnum); + err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size); + if (err) { + ubifs_err("cannot read LEB %d, error %d", lnum, err); + return; + } + while (1) { + offs = c->leb_size - len; /* current offset within the LEB */ + if (!is_a_node(c, buf, len)) { + int pad_len; + + pad_len = get_pad_len(c, buf, len); + if (pad_len) { + printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n", + lnum, offs, pad_len); + buf += pad_len; + len -= pad_len; + continue; + } + if (len) + printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n", + lnum, offs, len); + break; + } + + node_type = get_lpt_node_type(c, buf, &node_num); + switch (node_type) { + case UBIFS_LPT_PNODE: + { + node_len = c->pnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, pnode\n", + lnum, offs); + break; + } + case UBIFS_LPT_NNODE: + { +
int i; + struct ubifs_nnode nnode; + + node_len = c->nnode_sz; + if (c->big_lpt) + printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ", + lnum, offs, node_num); + else + printk(KERN_DEBUG "LEB %d:%d, nnode, ", + lnum, offs); + err = ubifs_unpack_nnode(c, buf, &nnode); /* NOTE(review): err is not checked before nnode is printed */ + for (i = 0; i < UBIFS_LPT_FANOUT; i++) { + printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum, + nnode.nbranch[i].offs); + if (i != UBIFS_LPT_FANOUT - 1) + printk(KERN_CONT ", "); + } + printk(KERN_CONT "\n"); + break; + } + case UBIFS_LPT_LTAB: + node_len = c->ltab_sz; + printk(KERN_DEBUG "LEB %d:%d, ltab\n", + lnum, offs); + break; + case UBIFS_LPT_LSAVE: + node_len = c->lsave_sz; + printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs); + break; + default: + ubifs_err("LPT node type %d not recognized", node_type); + return; /* the node length is unknown, so the scan cannot continue */ + } + + buf += node_len; + len -= node_len; + } + + printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n", + current->pid, lnum); +} + +/** + * dbg_dump_lpt_lebs - dump LPT lebs. + * @c: UBIFS file-system description object + * + * This function dumps all LPT LEBs. The caller has to make sure the LPT is + * locked. + */ +void dbg_dump_lpt_lebs(const struct ubifs_info *c) +{ + int i; + + printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n", + current->pid); + for (i = 0; i < c->lpt_lebs; i++) + dump_lpt_leb(c, i + c->lpt_first); + printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n", + current->pid); +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff -Nurd linux-2.6.24/fs/ubifs/master.c ubifs-v2.6.24/fs/ubifs/master.c --- linux-2.6.24/fs/ubifs/master.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/master.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,387 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* This file implements reading and writing the master node */ + +#include "ubifs.h" + +/** + * scan_for_master - search the valid master node. + * @c: UBIFS file-system description object + * + * This function scans the master node LEBs and searches for the latest master + * node. Returns zero in case of success and a negative error code in case of + * failure. + */ +static int scan_for_master(struct ubifs_info *c) +{ + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + int lnum, offs = 0, nodes_cnt; + + lnum = UBIFS_MST_LNUM; + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + nodes_cnt = sleb->nodes_cnt; + if (nodes_cnt > 0) { + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); /* last entry of the scanned node list */ + if (snod->type != UBIFS_MST_NODE) + goto out; + memcpy(c->mst_node, snod->node, snod->len); + offs = snod->offs; + } + ubifs_scan_destroy(sleb); + + lnum += 1; /* now check the copy in the following LEB */ + + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + if (sleb->nodes_cnt != nodes_cnt) + goto out; + if (!sleb->nodes_cnt) + goto out; + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, list); + if (snod->type != UBIFS_MST_NODE) + goto out; + if (snod->offs != offs) + goto out; + if (memcmp((void *)c->mst_node + UBIFS_CH_SZ, + (void *)snod->node + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) /* the first UBIFS_CH_SZ bytes are not compared */ + goto out; + c->mst_offs = offs; + ubifs_scan_destroy(sleb); + return 0; + +out: +
ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * validate_master - validate master node. + * @c: UBIFS file-system description object + * + * This function validates data which was read from master node. Returns zero + * if the data is all right and %-EINVAL if not. + */ +static int validate_master(const struct ubifs_info *c) +{ + long long main_sz; + int err; /* number of the failed check; only used in the error message */ + + if (c->max_sqnum >= SQNUM_WATERMARK) { + err = 1; + goto out; + } + + if (c->cmt_no >= c->max_sqnum) { + err = 2; + goto out; + } + + if (c->highest_inum >= INUM_WATERMARK) { + err = 3; + goto out; + } + + if (c->lhead_lnum < UBIFS_LOG_LNUM || + c->lhead_lnum >= UBIFS_LOG_LNUM + c->log_lebs || + c->lhead_offs < 0 || c->lhead_offs >= c->leb_size || + c->lhead_offs & (c->min_io_size - 1)) { + err = 4; + goto out; + } + + if (c->zroot.lnum >= c->leb_cnt || c->zroot.lnum < c->main_first || + c->zroot.offs >= c->leb_size || c->zroot.offs & 7) { + err = 5; + goto out; + } + + if (c->zroot.len < c->ranges[UBIFS_IDX_NODE].min_len || + c->zroot.len > c->ranges[UBIFS_IDX_NODE].max_len) { + err = 6; + goto out; + } + + if (c->gc_lnum >= c->leb_cnt || c->gc_lnum < c->main_first) { + err = 7; + goto out; + } + + if (c->ihead_lnum >= c->leb_cnt || c->ihead_lnum < c->main_first || + c->ihead_offs % c->min_io_size || c->ihead_offs < 0 || + c->ihead_offs > c->leb_size || c->ihead_offs & 7) { + err = 8; + goto out; + } + + main_sz = (long long)c->main_lebs * c->leb_size; /* widened before multiplying */ + if (c->old_idx_sz & 7 || c->old_idx_sz >= main_sz) { + err = 9; + goto out; + } + + if (c->lpt_lnum < c->lpt_first || c->lpt_lnum > c->lpt_last || + c->lpt_offs < 0 || c->lpt_offs + c->nnode_sz > c->leb_size) { + err = 10; + goto out; + } + + if (c->nhead_lnum < c->lpt_first || c->nhead_lnum > c->lpt_last || + c->nhead_offs < 0 || c->nhead_offs % c->min_io_size || + c->nhead_offs > c->leb_size) { + err = 11; + goto out; + } + + if (c->ltab_lnum < c->lpt_first || c->ltab_lnum > c->lpt_last || + c->ltab_offs < 0 || + c->ltab_offs + c->ltab_sz >
c->leb_size) { + err = 12; + goto out; + } + + if (c->big_lpt && (c->lsave_lnum < c->lpt_first || + c->lsave_lnum > c->lpt_last || c->lsave_offs < 0 || + c->lsave_offs + c->lsave_sz > c->leb_size)) { + err = 13; + goto out; + } + + if (c->lscan_lnum < c->main_first || c->lscan_lnum >= c->leb_cnt) { + err = 14; + goto out; + } + + if (c->lst.empty_lebs < 0 || c->lst.empty_lebs > c->main_lebs - 2) { + err = 15; + goto out; + } + + if (c->lst.idx_lebs < 0 || c->lst.idx_lebs > c->main_lebs - 1) { + err = 16; + goto out; + } + + if (c->lst.total_free < 0 || c->lst.total_free > main_sz || + c->lst.total_free & 7) { + err = 17; + goto out; + } + + if (c->lst.total_dirty < 0 || (c->lst.total_dirty & 7)) { + err = 18; + goto out; + } + + if (c->lst.total_used < 0 || (c->lst.total_used & 7)) { + err = 19; + goto out; + } + + if (c->lst.total_free + c->lst.total_dirty + + c->lst.total_used > main_sz) { + err = 20; + goto out; + } + + if (c->lst.total_dead + c->lst.total_dark + + c->lst.total_used + c->old_idx_sz > main_sz) { + err = 21; + goto out; + } + + if (c->lst.total_dead < 0 || + c->lst.total_dead > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dead & 7) { + err = 22; + goto out; + } + + if (c->lst.total_dark < 0 || + c->lst.total_dark > c->lst.total_free + c->lst.total_dirty || + c->lst.total_dark & 7) { + err = 23; + goto out; + } + + return 0; + +out: + ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); /* err is the number of the failed check */ + dbg_dump_node(c, c->mst_node); + return -EINVAL; +} + +/** + * ubifs_read_master - read master node. + * @c: UBIFS file-system description object + * + * This function finds and reads the master node during file-system mount. If + * no valid master node is found, 'ubifs_recover_master_node()' is tried. Returns + * zero in case of success and a negative error code in case of failure.
+ */ +int ubifs_read_master(struct ubifs_info *c) +{ + int err, old_leb_cnt; + + c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!c->mst_node) + return -ENOMEM; + + err = scan_for_master(c); + if (err) { + err = ubifs_recover_master_node(c); + if (err) + /* + * Note, we do not free 'c->mst_node' here because the + * unmount routine will take care of this. + */ + return err; + } + + /* Make sure that the recovery flag is clear */ + c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY); + + c->max_sqnum = le64_to_cpu(c->mst_node->ch.sqnum); /* master node fields are little-endian */ + c->highest_inum = le64_to_cpu(c->mst_node->highest_inum); + c->cmt_no = le64_to_cpu(c->mst_node->cmt_no); + c->zroot.lnum = le32_to_cpu(c->mst_node->root_lnum); + c->zroot.offs = le32_to_cpu(c->mst_node->root_offs); + c->zroot.len = le32_to_cpu(c->mst_node->root_len); + c->lhead_lnum = le32_to_cpu(c->mst_node->log_lnum); + c->gc_lnum = le32_to_cpu(c->mst_node->gc_lnum); + c->ihead_lnum = le32_to_cpu(c->mst_node->ihead_lnum); + c->ihead_offs = le32_to_cpu(c->mst_node->ihead_offs); + c->old_idx_sz = le64_to_cpu(c->mst_node->index_size); + c->lpt_lnum = le32_to_cpu(c->mst_node->lpt_lnum); + c->lpt_offs = le32_to_cpu(c->mst_node->lpt_offs); + c->nhead_lnum = le32_to_cpu(c->mst_node->nhead_lnum); + c->nhead_offs = le32_to_cpu(c->mst_node->nhead_offs); + c->ltab_lnum = le32_to_cpu(c->mst_node->ltab_lnum); + c->ltab_offs = le32_to_cpu(c->mst_node->ltab_offs); + c->lsave_lnum = le32_to_cpu(c->mst_node->lsave_lnum); + c->lsave_offs = le32_to_cpu(c->mst_node->lsave_offs); + c->lscan_lnum = le32_to_cpu(c->mst_node->lscan_lnum); + c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs); + c->lst.idx_lebs = le32_to_cpu(c->mst_node->idx_lebs); + old_leb_cnt = le32_to_cpu(c->mst_node->leb_cnt); + c->lst.total_free = le64_to_cpu(c->mst_node->total_free); + c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty); + c->lst.total_used = le64_to_cpu(c->mst_node->total_used); + c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead); +
c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark); + + c->calc_idx_sz = c->old_idx_sz; + + if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS)) + c->no_orphs = 1; + + if (old_leb_cnt != c->leb_cnt) { + /* The file system has been resized */ + int growth = c->leb_cnt - old_leb_cnt; /* LEBs added relative to the master node's leb_cnt */ + + if (c->leb_cnt < old_leb_cnt || + c->leb_cnt < UBIFS_MIN_LEB_CNT) { /* a smaller LEB count is rejected */ + ubifs_err("bad leb_cnt on master node"); + dbg_dump_node(c, c->mst_node); + return -EINVAL; + } + + dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs", + old_leb_cnt, c->leb_cnt); + c->lst.empty_lebs += growth; + c->lst.total_free += growth * (long long)c->leb_size; + c->lst.total_dark += growth * (long long)c->dark_wm; + + /* + * Reflect changes back onto the master node. N.B. the master + * node gets written immediately whenever mounting (or + * remounting) in read-write mode, so we do not need to write it + * here. + */ + c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt); + c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs); + c->mst_node->total_free = cpu_to_le64(c->lst.total_free); + c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark); + } + + err = validate_master(c); + if (err) + return err; + + err = dbg_old_index_check_init(c, &c->zroot); + + return err; +} + +/** + * ubifs_write_master - write master node. + * @c: UBIFS file-system description object + * + * This function writes the master node. The caller has to take the + * @c->mst_mutex lock before calling this function. Returns zero in case of + * success and a negative error code in case of failure. The master node is + * written twice to enable recovery.
+ */ +int ubifs_write_master(struct ubifs_info *c) +{ + int err, lnum, offs, len; + + if (c->ro_media) + return -EROFS; + + lnum = UBIFS_MST_LNUM; + offs = c->mst_offs + c->mst_node_alsz; /* slot right after the previous copy */ + len = UBIFS_MST_NODE_SZ; + + if (offs + UBIFS_MST_NODE_SZ > c->leb_size) { /* no room left: unmap and restart at 0 */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + offs = 0; + } + + c->mst_offs = offs; + c->mst_node->highest_inum = cpu_to_le64(c->highest_inum); + + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + if (err) + return err; + + lnum += 1; /* second copy: next LEB, same offset */ + + if (offs == 0) { /* the first LEB was restarted, so unmap this one as well */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM); + + return err; +} diff -Nurd linux-2.6.24/fs/ubifs/misc.h ubifs-v2.6.24/fs/ubifs/misc.h --- linux-2.6.24/fs/ubifs/misc.h 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/misc.h 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,340 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file contains miscellaneous helper functions. + */ + +#ifndef __UBIFS_MISC_H__ +#define __UBIFS_MISC_H__ + +/** + * ubifs_zn_dirty - check if znode is dirty.
+ * @znode: znode to check + * + * This helper function returns %1 if @znode is dirty and %0 otherwise. + */ +static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) +{ + return !!test_bit(DIRTY_ZNODE, &znode->flags); +} + +/** + * ubifs_wake_up_bgt - wake up background thread. + * @c: UBIFS file-system description object + */ +static inline void ubifs_wake_up_bgt(struct ubifs_info *c) +{ + if (c->bgt && !c->need_bgt) { /* only if a thread exists and the flag is not already set */ + c->need_bgt = 1; + wake_up_process(c->bgt); + } +} + +/** + * ubifs_tnc_find_child - find next child in znode. + * @znode: znode to search at + * @start: the zbranch index to start at + * + * This helper function looks for znode child starting at index @start. Returns + * the child or %NULL if no children were found. + */ +static inline struct ubifs_znode * +ubifs_tnc_find_child(struct ubifs_znode *znode, int start) +{ + while (start < znode->child_cnt) { + if (znode->zbranch[start].znode) + return znode->zbranch[start].znode; + start += 1; + } + + return NULL; +} + +/** + * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. + * @inode: the VFS 'struct inode' pointer + */ +static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) +{ + return container_of(inode, struct ubifs_inode, vfs_inode); +} + +/** + * ubifs_compr_present - check if compressor was compiled in. + * @compr_type: compressor type to check + * + * This function returns %1 if compressor of type @compr_type is present, and + * %0 if not. + */ +static inline int ubifs_compr_present(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return !!ubifs_compressors[compr_type]->capi_name; /* present when capi_name is set */ +} + +/** + * ubifs_compr_name - get compressor name string by its type. + * @compr_type: compressor type + * + * This function returns compressor type string.
+ */ +static inline const char *ubifs_compr_name(int compr_type) +{ + ubifs_assert(compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); + return ubifs_compressors[compr_type]->name; +} + +/** + * ubifs_wbuf_sync - synchronize write-buffer. + * @wbuf: write-buffer to synchronize + * + * This is the same as 'ubifs_wbuf_sync_nolock()' but it does not assume + * that the write-buffer is already locked. + */ +static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) +{ + int err; + + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_wbuf_sync_nolock(wbuf); + mutex_unlock(&wbuf->io_mutex); + return err; +} + +/** + * ubifs_leb_unmap - unmap an LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to unmap + * + * Returns %0 on success, %-EROFS on read-only media, or a negative error code. + */ +static inline int ubifs_leb_unmap(const struct ubifs_info *c, int lnum) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_unmap(c->ubi, lnum); + if (err) { + ubifs_err("unmap LEB %d failed, error %d", lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_write - write to a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @offs: offset within LEB to write to + * @len: length to write + * @dtype: data type + * + * Returns %0 on success, %-EROFS on read-only media, or a negative error code. + */ +static inline int ubifs_leb_write(const struct ubifs_info *c, int lnum, + const void *buf, int offs, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; + err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype); + if (err) { + ubifs_err("writing %d bytes at %d:%d, error %d", + len, lnum, offs, err); + return err; + } + + return 0; +} + +/** + * ubifs_leb_change - atomic LEB change.
+ * @c: UBIFS file-system description object + * @lnum: LEB number to write + * @buf: buffer to write from + * @len: length to write + * @dtype: data type + * + * This function returns %0 on success and a negative error code on failure. + */ +static inline int ubifs_leb_change(const struct ubifs_info *c, int lnum, + const void *buf, int len, int dtype) +{ + int err; + + if (c->ro_media) + return -EROFS; /* nothing is written when the media is read-only */ + err = ubi_leb_change(c->ubi, lnum, buf, len, dtype); + if (err) { + ubifs_err("changing %d bytes in LEB %d, error %d", + len, lnum, err); + return err; + } + + return 0; +} + +/** + * ubifs_encode_dev - encode device node IDs. + * @dev: UBIFS device node information + * @rdev: device IDs to encode + * + * This is a helper function which encodes major/minor numbers of a device node + * into UBIFS device node description. We use standard Linux "new" and "huge" + * encodings. Returns the size in bytes of the encoding stored in @dev. + */ +static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) +{ + if (new_valid_dev(rdev)) { + dev->new = cpu_to_le32(new_encode_dev(rdev)); + return sizeof(dev->new); + } else { + dev->huge = cpu_to_le64(huge_encode_dev(rdev)); + return sizeof(dev->huge); + } +} + +/** + * ubifs_add_dirt - add dirty space to LEB properties. + * @c: the UBIFS file-system description object + * @lnum: LEB to add dirty space for + * @dirty: dirty space to add + * + * This is a helper function which increases the amount of dirty LEB space. Returns + * zero in case of success and a negative error code in case of failure. + */ +static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) +{ + return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); +} + +/** + * ubifs_return_leb - return LEB to lprops. + * @c: the UBIFS file-system description object + * @lnum: LEB to return + * + * This helper function cleans the "taken" flag of a logical eraseblock in the + * lprops. Returns zero in case of success and a negative error code in case of + * failure.
+ */ +static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) +{ + return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); +} + +/** + * ubifs_idx_node_sz - return index node size. + * @c: the UBIFS file-system description object + * @child_cnt: number of children (branches) of this index node + */ +static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) +{ + return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len) * child_cnt; +} + +/** + * ubifs_idx_branch - return pointer to an index branch. + * @c: the UBIFS file-system description object + * @idx: index node + * @bnum: branch number (branches are UBIFS_BRANCH_SZ + @c->key_len apart) + */ +static inline +struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, + const struct ubifs_idx_node *idx, + int bnum) +{ + return (struct ubifs_branch *)((void *)idx->branches + + (UBIFS_BRANCH_SZ + c->key_len) * bnum); +} + +/** + * ubifs_idx_key - return pointer to the first branch key of an index node. + * @c: the UBIFS file-system description object + * @idx: index node + */ +static inline void *ubifs_idx_key(const struct ubifs_info *c, + const struct ubifs_idx_node *idx) +{ + return (void *)((struct ubifs_branch *)idx->branches)->key; +} + +/** + * ubifs_current_time - round current time to time granularity. + * @inode: inode (its superblock's time granularity selects the time source) + */ +static inline struct timespec ubifs_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. 
+ */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + +/** + * ubifs_get_lprops - get reference to LEB properties. + * @c: the UBIFS file-system description object + * + * This function locks lprops (@c->lp_mutex). Lprops have to be unlocked by + * 'ubifs_release_lprops()'. + */ +static inline void ubifs_get_lprops(struct ubifs_info *c) +{ + mutex_lock(&c->lp_mutex); +} + +/** + * ubifs_release_lprops - release lprops lock. + * @c: the UBIFS file-system description object + * + * This function has to be called after each 'ubifs_get_lprops()' call to + * unlock lprops. It also asserts that @c->lst.empty_lebs is within range. + */ +static inline void ubifs_release_lprops(struct ubifs_info *c) +{ + ubifs_assert(mutex_is_locked(&c->lp_mutex)); + ubifs_assert(c->lst.empty_lebs >= 0 && + c->lst.empty_lebs <= c->main_lebs); + mutex_unlock(&c->lp_mutex); +} + +#endif /* __UBIFS_MISC_H__ */ diff -Nurd linux-2.6.24/fs/ubifs/orphan.c ubifs-v2.6.24/fs/ubifs/orphan.c --- linux-2.6.24/fs/ubifs/orphan.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/orphan.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,962 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Author: Adrian Hunter + */ + +#include "ubifs.h" + +/* + * An orphan is an inode number whose inode node has been committed to the index + * with a link count of zero. That happens when an open file is deleted + * (unlinked) and then a commit is run. In the normal course of events the inode + * would be deleted when the file is closed. However in the case of an unclean + * unmount, orphans need to be accounted for. After an unclean unmount, the + * orphans' inodes must be deleted which means either scanning the entire index + * looking for them, or keeping a list on flash somewhere. This unit implements + * the latter approach. + * + * The orphan area is a fixed number of LEBs situated between the LPT area and + * the main area. The number of orphan area LEBs is specified when the file + * system is created. The minimum number is 1. The size of the orphan area + * should be so that it can hold the maximum number of orphans that are expected + * to ever exist at one time. + * + * The number of orphans that can fit in a LEB is: + * + * (c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64) + * + * For example: a 15872 byte LEB can fit 1980 orphans so 1 LEB may be enough. + * + * Orphans are accumulated in a rb-tree. When an inode's link count drops to + * zero, the inode number is added to the rb-tree. It is removed from the tree + * when the inode is deleted. Any new orphans that are in the orphan tree when + * the commit is run, are written to the orphan area in 1 or more orphan nodes. + * If the orphan area is full, it is consolidated to make space. There is + * always enough space because validation prevents the user from creating more + * than the maximum number of orphans allowed. 
+ */ + +#ifdef CONFIG_UBIFS_FS_DEBUG +static int dbg_check_orphans(struct ubifs_info *c); +#else +#define dbg_check_orphans(c) 0 +#endif + +/** + * ubifs_add_orphan - add an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Add an orphan. This function is called when an inode's link count drops to + * zero. Returns %0, %-ENOMEM, or %-ENFILE if the orphan limit is reached. + */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + orphan->new = 1; + + spin_lock(&c->orphan_lock); + if (c->tot_orphans >= c->max_orphans) { + spin_unlock(&c->orphan_lock); + kfree(orphan); + return -ENFILE; + } + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + dbg_err("orphaned twice"); + spin_unlock(&c->orphan_lock); + kfree(orphan); /* already in the tree - not an error */ + return 0; + } + } + c->tot_orphans += 1; + c->new_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + list_add_tail(&orphan->new_list, &c->orph_new); + spin_unlock(&c->orphan_lock); + dbg_gen("ino %lu", (unsigned long)inum); + return 0; +} + +/** + * ubifs_delete_orphan - delete an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * Delete an orphan. This function is called when an inode is deleted. 
+ */ +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + if (o->dnext) { /* already on the deletion list */ + spin_unlock(&c->orphan_lock); + dbg_gen("deleted twice ino %lu", + (unsigned long)inum); + return; + } + if (o->cnext) { /* on the commit list - defer deletion */ + o->dnext = c->orph_dnext; + c->orph_dnext = o; + spin_unlock(&c->orphan_lock); + dbg_gen("delete later ino %lu", + (unsigned long)inum); + return; + } + rb_erase(p, &c->orph_tree); + list_del(&o->list); + c->tot_orphans -= 1; + if (o->new) { + list_del(&o->new_list); + c->new_orphans -= 1; + } + spin_unlock(&c->orphan_lock); + kfree(o); + dbg_gen("inum %lu", (unsigned long)inum); + return; + } + } + spin_unlock(&c->orphan_lock); + dbg_err("missing orphan ino %lu", (unsigned long)inum); + dbg_dump_stack(); +} + +/** + * ubifs_orphan_start_commit - start commit of orphans. + * @c: UBIFS file-system description object + * + * Start commit of orphans: move the new orphans to the commit (cnext) list. + */ +int ubifs_orphan_start_commit(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, **last; + + spin_lock(&c->orphan_lock); + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_new, new_list) { + ubifs_assert(orphan->new); + orphan->new = 0; + *last = orphan; + last = &orphan->cnext; + } + *last = orphan->cnext; /* NOTE(review): the loop has ended, so 'orphan' is presumably the list-head pseudo-entry, not a real orphan - confirm the intended terminator; 'ubifs_delete_orphan()' tests 'cnext' for non-NULL */ + c->cmt_orphans = c->new_orphans; + c->new_orphans = 0; + dbg_cmt("%d orphans to commit", c->cmt_orphans); + INIT_LIST_HEAD(&c->orph_new); + if (c->tot_orphans == 0) + c->no_orphs = 1; + else + c->no_orphs = 0; + spin_unlock(&c->orphan_lock); + return 0; +} + +/** + * avail_orphs - calculate available space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in the + * available space. 
+ */ +static int avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail, gap; + + avail_lebs = c->orph_lebs - (c->ohead_lnum - c->orph_first) - 1; /* LEBs after the head LEB */ + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + gap = c->leb_size - c->ohead_offs; /* room left in the head LEB */ + if (gap >= UBIFS_ORPH_NODE_SZ + sizeof(__le64)) + avail += (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); + return avail; +} + +/** + * tot_avail_orphs - calculate total space. + * @c: UBIFS file-system description object + * + * This function returns the number of orphans that can be written in half + * the total space. That leaves half the space for adding new orphans. + */ +static int tot_avail_orphs(struct ubifs_info *c) +{ + int avail_lebs, avail; + + avail_lebs = c->orph_lebs; + avail = avail_lebs * + ((c->leb_size - UBIFS_ORPH_NODE_SZ) / sizeof(__le64)); + return avail / 2; +} + +/** + * do_write_orph_node - write a node to the orphan head. + * @c: UBIFS file-system description object + * @len: length of node + * @atomic: write atomically + * + * This function writes a node to the orphan head from the orphan buffer. If + * @atomic is not zero, then the write is done atomically. On success, %0 is + * returned, otherwise a negative error code is returned. + */ +static int do_write_orph_node(struct ubifs_info *c, int len, int atomic) +{ + int err = 0; + + if (atomic) { + ubifs_assert(c->ohead_offs == 0); + ubifs_prepare_node(c, c->orph_buf, len, 1); + len = ALIGN(len, c->min_io_size); + err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len, + UBI_SHORTTERM); + } else { + if (c->ohead_offs == 0) { + /* Ensure LEB has been unmapped before writing at offset 0 */ + err = ubifs_leb_unmap(c, c->ohead_lnum); + if (err) + return err; + } + err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum, + c->ohead_offs, UBI_SHORTTERM); + } + return err; +} + +/** + * write_orph_node - write an orphan node. 
+ * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function builds an orphan node from the cnext list and writes it to the + * orphan head. On success, %0 is returned, otherwise a negative error code + * is returned. + */ +static int write_orph_node(struct ubifs_info *c, int atomic) +{ + struct ubifs_orphan *orphan, *cnext; + struct ubifs_orph_node *orph; + int gap, err, len, cnt, i; + + ubifs_assert(c->cmt_orphans > 0); + gap = c->leb_size - c->ohead_offs; + if (gap < UBIFS_ORPH_NODE_SZ + sizeof(__le64)) { /* no room for even one inode number */ + c->ohead_lnum += 1; + c->ohead_offs = 0; + gap = c->leb_size; + if (c->ohead_lnum > c->orph_last) { + /* + * We limit the number of orphans so that this should + * never happen. + */ + ubifs_err("out of space in orphan area"); + return -EINVAL; + } + } + cnt = (gap - UBIFS_ORPH_NODE_SZ) / sizeof(__le64); /* inode numbers that fit in the gap */ + if (cnt > c->cmt_orphans) + cnt = c->cmt_orphans; + len = UBIFS_ORPH_NODE_SZ + cnt * sizeof(__le64); + ubifs_assert(c->orph_buf); + orph = c->orph_buf; + orph->ch.node_type = UBIFS_ORPH_NODE; + spin_lock(&c->orphan_lock); + cnext = c->orph_cnext; + for (i = 0; i < cnt; i++) { + orphan = cnext; + orph->inos[i] = cpu_to_le64(orphan->inum); + cnext = orphan->cnext; + orphan->cnext = NULL; /* taken off the commit list */ + } + c->orph_cnext = cnext; + c->cmt_orphans -= cnt; + spin_unlock(&c->orphan_lock); + if (c->cmt_orphans) + orph->cmt_no = cpu_to_le64(c->cmt_no); + else + /* Mark the last node of the commit by setting the top bit */ + orph->cmt_no = cpu_to_le64((c->cmt_no) | (1ULL << 63)); + ubifs_assert(c->ohead_offs + len <= c->leb_size); + ubifs_assert(c->ohead_lnum >= c->orph_first); + ubifs_assert(c->ohead_lnum <= c->orph_last); + err = do_write_orph_node(c, len, atomic); /* the head is advanced below even on error */ + c->ohead_offs += ALIGN(len, c->min_io_size); + c->ohead_offs = ALIGN(c->ohead_offs, 8); + return err; +} + +/** + * write_orph_nodes - write orphan nodes until there are no more to commit. 
+ * @c: UBIFS file-system description object + * @atomic: write atomically + * + * This function writes orphan nodes for all the orphans to commit. On success, + * %0 is returned, otherwise a negative error code is returned. + */ +static int write_orph_nodes(struct ubifs_info *c, int atomic) +{ + int err; + + while (c->cmt_orphans > 0) { + err = write_orph_node(c, atomic); + if (err) + return err; + } + if (atomic) { + int lnum; + + /* Unmap any unused LEBs after consolidation */ + lnum = c->ohead_lnum + 1; + for (lnum = c->ohead_lnum + 1; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + } + return 0; +} + +/** + * consolidate - consolidate the orphan area. + * @c: UBIFS file-system description object + * + * This function enables consolidation by putting all the orphans into the list + * to commit. The list is in the order that the orphans were added, and the + * LEBs are written atomically in order, so at no time can orphans be lost by + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int consolidate(struct ubifs_info *c) +{ + int tot_avail = tot_avail_orphs(c), err = 0; + + spin_lock(&c->orphan_lock); + dbg_cmt("there is space for %d orphans and there are %d", + tot_avail, c->tot_orphans); + if (c->tot_orphans - c->new_orphans <= tot_avail) { + struct ubifs_orphan *orphan, **last; + int cnt = 0; + + /* Change the cnext list to include all non-new orphans */ + last = &c->orph_cnext; + list_for_each_entry(orphan, &c->orph_list, list) { + if (orphan->new) + continue; + *last = orphan; + last = &orphan->cnext; + cnt += 1; + } + *last = orphan->cnext; + ubifs_assert(cnt == c->tot_orphans - c->new_orphans); + c->cmt_orphans = cnt; + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + } else { + /* + * We limit the number of orphans so that this should + * never happen. 
+ */ + ubifs_err("out of space in orphan area"); + err = -EINVAL; + } + spin_unlock(&c->orphan_lock); + return err; +} + +/** + * commit_orphans - commit orphans. + * @c: UBIFS file-system description object + * + * This function commits orphans to flash. On success, %0 is returned, + * otherwise a negative error code is returned. + */ +static int commit_orphans(struct ubifs_info *c) +{ + int avail, atomic = 0, err; + + ubifs_assert(c->cmt_orphans > 0); + avail = avail_orphs(c); + if (avail < c->cmt_orphans) { + /* Not enough space to write new orphans, so consolidate */ + err = consolidate(c); + if (err) + return err; + atomic = 1; + } + err = write_orph_nodes(c, atomic); + return err; +} + +/** + * erase_deleted - erase the orphans marked for deletion. + * @c: UBIFS file-system description object + * + * During commit, the orphans being committed cannot be deleted, so they are + * marked for deletion and deleted by this function. Also, the recovery + * adds killed orphans to the deletion list, and therefore they are deleted + * here too. + */ +static void erase_deleted(struct ubifs_info *c) +{ + struct ubifs_orphan *orphan, *dnext; + + spin_lock(&c->orphan_lock); + dnext = c->orph_dnext; + while (dnext) { + orphan = dnext; + dnext = orphan->dnext; + ubifs_assert(!orphan->new); + rb_erase(&orphan->rb, &c->orph_tree); + list_del(&orphan->list); + c->tot_orphans -= 1; + dbg_gen("deleting orphan ino %lu", (unsigned long)orphan->inum); + kfree(orphan); + } + c->orph_dnext = NULL; + spin_unlock(&c->orphan_lock); +} + +/** + * ubifs_orphan_end_commit - end commit of orphans. + * @c: UBIFS file-system description object + * + * End commit of orphans. + */ +int ubifs_orphan_end_commit(struct ubifs_info *c) +{ + int err; + + if (c->cmt_orphans != 0) { + err = commit_orphans(c); + if (err) + return err; + } + erase_deleted(c); + err = dbg_check_orphans(c); + return err; +} + +/** + * ubifs_clear_orphans - erase all LEBs used for orphans. 
+ * @c: UBIFS file-system description object + * + * If recovery is not required, then the orphans from the previous session + * are not needed. This function locates the LEBs used to record + * orphans, and un-maps them. + */ +int ubifs_clear_orphans(struct ubifs_info *c) +{ + int lnum, err; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } + c->ohead_lnum = c->orph_first; /* orphan head back to the area start */ + c->ohead_offs = 0; + return 0; +} + +/** + * insert_dead_orphan - insert an orphan. + * @c: UBIFS file-system description object + * @inum: orphan inode number + * + * This function is a helper to the 'do_kill_orphans()' function. The orphan + * must be kept until the next commit, so it is added to the rb-tree and the + * deletion list. Returns %0 on success (or if already present) or %-ENOMEM. + */ +static int insert_dead_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct ubifs_orphan), GFP_KERNEL); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &c->orph_tree.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + /* Already added - no problem */ + kfree(orphan); + return 0; + } + } + c->tot_orphans += 1; + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, &c->orph_tree); + list_add_tail(&orphan->list, &c->orph_list); + orphan->dnext = c->orph_dnext; /* push onto the deletion list */ + c->orph_dnext = orphan; + dbg_mnt("ino %lu, new %d, tot %d", (unsigned long)inum, + c->new_orphans, c->tot_orphans); + return 0; +} + +/** + * do_kill_orphans - remove orphan inodes from the index. 
+ * @c: UBIFS file-system description object + * @sleb: scanned LEB + * @last_cmt_no: cmt_no of last orphan node read is passed and returned here + * @outofdate: whether the LEB is out of date is returned here + * @last_flagged: whether the end orphan node is encountered + * + * This function is a helper to the 'kill_orphans()' function. It goes through + * every orphan node in a LEB and for every inode number recorded, removes + * all keys for that inode from the TNC. + */ +static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + unsigned long long *last_cmt_no, int *outofdate, + int *last_flagged) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + unsigned long long cmt_no; + ino_t inum; + int i, n, err, first = 1; + + list_for_each_entry(snod, &sleb->nodes, list) { + if (snod->type != UBIFS_ORPH_NODE) { + ubifs_err("invalid node type %d in orphan area at " + "%d:%d", snod->type, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + + orph = snod->node; + + /* Check commit number (top bit is the last-node flag, so mask it) */ + cmt_no = le64_to_cpu(orph->cmt_no) & LLONG_MAX; + /* + * The commit number on the master node may be less, because + * of a failed commit. If there are several failed commits in a + * row, the commit number written on orphan nodes will continue + * to increase (because the commit number is adjusted here) even + * though the commit number on the master node stays the same + * because the master node has not been re-written. + */ + if (cmt_no > c->cmt_no) + c->cmt_no = cmt_no; + if (cmt_no < *last_cmt_no && *last_flagged) { + /* + * The last orphan node had a higher commit number and + * was flagged as the last written for that commit + * number. That makes this orphan node out of date. 
+ */ + if (!first) { + ubifs_err("out of order commit number %llu in " + "orphan node at %d:%d", + cmt_no, sleb->lnum, snod->offs); + dbg_dump_node(c, snod->node); + return -EINVAL; + } + dbg_rcvry("out of date LEB %d", sleb->lnum); + *outofdate = 1; + return 0; + } + + if (first) + first = 0; + + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; /* inode numbers are 8 bytes each */ + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + dbg_rcvry("deleting orphaned inode %lu", + (unsigned long)inum); + err = ubifs_tnc_remove_ino(c, inum); + if (err) + return err; + err = insert_dead_orphan(c, inum); + if (err) + return err; + } + + *last_cmt_no = cmt_no; + if (le64_to_cpu(orph->cmt_no) & (1ULL << 63)) { + dbg_rcvry("last orph node for commit %llu at %d:%d", + cmt_no, sleb->lnum, snod->offs); + *last_flagged = 1; + } else + *last_flagged = 0; + } + + return 0; +} + +/** + * kill_orphans - remove all orphan inodes from the index. + * @c: UBIFS file-system description object + * + * If recovery is required, then orphan inodes recorded during the previous + * session (which ended with an unclean unmount) must be deleted from the index. + * This is done by updating the TNC, but since the index is not updated until + * the next commit, the LEBs where the orphan information is recorded are not + * erased until the next commit. Returns %0 or a negative error code. + */ +static int kill_orphans(struct ubifs_info *c) +{ + unsigned long long last_cmt_no = 0; + int lnum, err = 0, outofdate = 0, last_flagged = 0; + + c->ohead_lnum = c->orph_first; + c->ohead_offs = 0; + /* Check no-orphans flag and skip this if no orphans */ + if (c->no_orphs) { + dbg_rcvry("no orphans"); + return 0; + } + /* + * Orph nodes always start at c->orph_first and are written to each + * successive LEB in turn. Generally unused LEBs will have been unmapped + * but may contain out of date orphan nodes if the unmap didn't go + * through. In addition, the last orphan node written for each commit is + * marked (top bit of orph->cmt_no is set to 1). 
It is possible that + * there are orphan nodes from the next commit (i.e. the commit did not + * complete successfully). In that case, no orphans will have been lost + * due to the way that orphans are written, and any orphans added will + * be valid orphans anyway and so can be deleted. + */ + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + dbg_rcvry("LEB %d", lnum); + sleb = ubifs_scan(c, lnum, 0, c->sbuf); + if (IS_ERR(sleb)) { /* plain scan failed - try the recovering scan */ + sleb = ubifs_recover_leb(c, lnum, 0, c->sbuf, 0); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + } + err = do_kill_orphans(c, sleb, &last_cmt_no, &outofdate, + &last_flagged); + if (err || outofdate) { + ubifs_scan_destroy(sleb); + break; + } + if (sleb->endpt) { /* move the orphan head to the end of this LEB's data */ + c->ohead_lnum = lnum; + c->ohead_offs = sleb->endpt; + } + ubifs_scan_destroy(sleb); + } + return err; +} + +/** + * ubifs_mount_orphans - delete orphan inodes and erase LEBs that recorded them. + * @c: UBIFS file-system description object + * @unclean: indicates recovery from unclean unmount + * @read_only: indicates read only mount + * + * This function is called when mounting to erase orphans from the previous + * session. If UBIFS was not unmounted cleanly, then the inodes recorded as + * orphans are deleted. 
+ */ +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only) +{ + int err = 0; + + c->max_orphans = tot_avail_orphs(c); + /* The orphan buffer is allocated only for read-write mounts */ + if (!read_only) { + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) + return -ENOMEM; + } + /* Unclean: kill recorded orphans; clean and writable: just unmap the area */ + if (unclean) + err = kill_orphans(c); + else if (!read_only) + err = ubifs_clear_orphans(c); + + return err; +} + +#ifdef CONFIG_UBIFS_FS_DEBUG +/* An orphan inode number read from flash, a node of the 'check_info.root' tree */ +struct check_orphan { + struct rb_node rb; + ino_t inum; +}; +/* Working data of 'dbg_check_orphans()' and its helpers */ +struct check_info { + unsigned long last_ino; + unsigned long tot_inos; + unsigned long missing; + unsigned long long leaf_cnt; + struct ubifs_ino_node *node; + struct rb_root root; +}; +/* Return %1 if @inum is in the @c->orph_tree rb-tree, otherwise %0 */ +static int dbg_find_orphan(struct ubifs_info *c, ino_t inum) +{ + struct ubifs_orphan *o; + struct rb_node *p; + + spin_lock(&c->orphan_lock); + p = c->orph_tree.rb_node; + while (p) { + o = rb_entry(p, struct ubifs_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else { + spin_unlock(&c->orphan_lock); + return 1; + } + } + spin_unlock(&c->orphan_lock); + return 0; +} +/* Add @inum to the check tree @root (no-op if present); %0 or %-ENOMEM */ +static int dbg_ins_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *orphan, *o; + struct rb_node **p, *parent = NULL; + + orphan = kzalloc(sizeof(struct check_orphan), GFP_NOFS); + if (!orphan) + return -ENOMEM; + orphan->inum = inum; + + p = &root->rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct check_orphan, rb); + if (inum < o->inum) + p = &(*p)->rb_left; + else if (inum > o->inum) + p = &(*p)->rb_right; + else { + kfree(orphan); + return 0; + } + } + rb_link_node(&orphan->rb, parent, p); + rb_insert_color(&orphan->rb, root); + return 0; +} +/* Return %1 if @inum is in the check tree @root, otherwise %0 */ +static int dbg_find_check_orphan(struct rb_root *root, ino_t inum) +{ + struct check_orphan *o; + struct rb_node *p; + + p = root->rb_node; + while (p) { + o = rb_entry(p, struct check_orphan, rb); + if (inum < o->inum) + p = p->rb_left; + else if (inum > o->inum) + p = p->rb_right; + else + return 1; + } + return 0; +} + 
+static void dbg_free_check_tree(struct rb_root *root) +{ + struct rb_node *this = root->rb_node; + struct check_orphan *o; + /* Free leaf nodes first, unlinking each from its parent, until empty */ + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + o = rb_entry(this, struct check_orphan, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &o->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(o); + } +} +/* Index walk callback: count leaves, report nlink == 0 inodes not recorded */ +static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *priv) +{ + struct check_info *ci = priv; + ino_t inum; + int err; + + inum = key_inum(c, &zbr->key); + if (inum != ci->last_ino) { + /* Lowest node type is the inode node, so it comes first */ + if (key_type(c, &zbr->key) != UBIFS_INO_KEY) + ubifs_err("found orphan node ino %lu, type %d", + (unsigned long)inum, key_type(c, &zbr->key)); + ci->last_ino = inum; + ci->tot_inos += 1; + err = ubifs_tnc_read_node(c, zbr, ci->node); + if (err) { + ubifs_err("node read failed, error %d", err); + return err; + } + if (ci->node->nlink == 0) + /* Must be recorded as an orphan, on flash or in memory */ + if (!dbg_find_check_orphan(&ci->root, inum) && + !dbg_find_orphan(c, inum)) { + ubifs_err("missing orphan, ino %lu", + (unsigned long)inum); + ci->missing += 1; + } + } + ci->leaf_cnt += 1; + return 0; +} +/* Add every inode number in the orphan nodes of @sleb to @ci->root */ +static int dbg_read_orphans(struct check_info *ci, struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *snod; + struct ubifs_orph_node *orph; + ino_t inum; + int i, n, err; + + list_for_each_entry(snod, &sleb->nodes, list) { + cond_resched(); + if (snod->type != UBIFS_ORPH_NODE) + continue; + orph = snod->node; + n = (le32_to_cpu(orph->ch.len) - UBIFS_ORPH_NODE_SZ) >> 3; + for (i = 0; i < n; i++) { + inum = le64_to_cpu(orph->inos[i]); + err = dbg_ins_check_orphan(&ci->root, inum); + if (err) + return err; + } + } + return 0; +} +/* Read the inode numbers recorded in all orphan area LEBs into @ci */ +static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) +{ + int lnum, err = 0; + + /* Check 
no-orphans flag and skip this if no orphans */ + if (c->no_orphs) + return 0; + + for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) { + struct ubifs_scan_leb *sleb; + + sleb = ubifs_scan(c, lnum, 0, c->dbg->buf); + if (IS_ERR(sleb)) { + err = PTR_ERR(sleb); + break; + } + + err = dbg_read_orphans(ci, sleb); + ubifs_scan_destroy(sleb); + if (err) + break; + } + + return err; +} +/* Check that every inode with zero nlink in the index is a known orphan */ +static int dbg_check_orphans(struct ubifs_info *c) +{ + struct check_info ci; + int err; + + if (!(ubifs_chk_flags & UBIFS_CHK_ORPH)) + return 0; + + ci.last_ino = 0; + ci.tot_inos = 0; + ci.missing = 0; + ci.leaf_cnt = 0; + ci.root = RB_ROOT; + ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ci.node) { + ubifs_err("out of memory"); + return -ENOMEM; + } + + err = dbg_scan_orphans(c, &ci); + if (err) + goto out; + + err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); + if (err) { + ubifs_err("cannot scan TNC, error %d", err); + goto out; + } + + if (ci.missing) { + ubifs_err("%lu missing orphan(s)", ci.missing); + err = -EINVAL; + goto out; + } + + dbg_cmt("last inode number is %lu", ci.last_ino); + dbg_cmt("total number of inodes is %lu", ci.tot_inos); + dbg_cmt("total number of leaf nodes is %llu", ci.leaf_cnt); + +out: + dbg_free_check_tree(&ci.root); + kfree(ci.node); + return err; +} + +#endif /* CONFIG_UBIFS_FS_DEBUG */ diff -Nurd linux-2.6.24/fs/ubifs/recovery.c ubifs-v2.6.24/fs/ubifs/recovery.c --- linux-2.6.24/fs/ubifs/recovery.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/recovery.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,1496 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements functions needed to recover from unclean un-mounts. + * When UBIFS is mounted, it checks a flag on the master node to determine if + * an un-mount was completed successfully. If not, the process of mounting + * incorporates additional checking and fixing of on-flash data structures. + * UBIFS always cleans away all remnants of an unclean un-mount, so that + * errors do not accumulate. However UBIFS defers recovery if it is mounted + * read-only, and the flash is not modified in that case. + */ + +#include /* NOTE(review): the header name is missing here, presumably lost in extraction - restore it from the original patch */ +#include "ubifs.h" + +/** + * is_empty - determine whether a buffer is empty (contains all 0xff). + * @buf: buffer to check + * @len: length of buffer + * + * This function returns %1 if the buffer is empty (contains all 0xff) otherwise + * %0 is returned. A buffer of zero length is reported as empty. + */ +static int is_empty(void *buf, int len) +{ + uint8_t *p = buf; + int i; + + for (i = 0; i < len; i++) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * get_master_node - get the last valid master node allowing for corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @pbuf: buffer containing the LEB read, is returned here + * @mst: master node, if found, is returned here + * @cor: corruption, if found, is returned here + * + * This function allocates a buffer, reads the LEB into it, and finds and + * returns the last valid master node allowing for one area of corruption. 
+ * The corrupt area, if there is one, must be consistent with the assumption + * that it is the result of an unclean unmount while the master node was being + * written. Under those circumstances, it is valid to use the previously written + * master node. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int get_master_node(const struct ubifs_info *c, int lnum, void **pbuf, + struct ubifs_mst_node **mst, void **cor) +{ + const int sz = c->mst_node_alsz; + int err, offs, len; + void *sbuf, *buf; + + sbuf = vmalloc(c->leb_size); + if (!sbuf) + return -ENOMEM; + + err = ubi_read(c->ubi, lnum, sbuf, 0, c->leb_size); + if (err && err != -EBADMSG) + goto out_free; + + /* Find the first position that is definitely not a node (no node magic) */ + offs = 0; + buf = sbuf; + len = c->leb_size; + while (offs + UBIFS_MST_NODE_SZ <= c->leb_size) { + struct ubifs_ch *ch = buf; + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + break; + offs += sz; + buf += sz; + len -= sz; + } + /* See if there was a valid master node before that */ + if (offs) { + int ret; + + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE && offs) { + /* Could have been corruption so check one place back */ + offs -= sz; + buf -= sz; + len += sz; + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1); + if (ret != SCANNED_A_NODE) + /* + * We accept only one area of corruption because + * we are assuming that it was caused while + * trying to write a master node. 
+ */ + goto out_err; + } + if (ret == SCANNED_A_NODE) { + struct ubifs_ch *ch = buf; + + if (ch->node_type != UBIFS_MST_NODE) + goto out_err; + dbg_rcvry("found a master node at %d:%d", lnum, offs); + *mst = buf; + offs += sz; + buf += sz; + len -= sz; + } + } + /* Check for corruption: one node-sized area here may be non-empty */ + if (offs < c->leb_size) { + if (!is_empty(buf, min_t(int, len, sz))) { + *cor = buf; + dbg_rcvry("found corruption at %d:%d", lnum, offs); + } + offs += sz; + buf += sz; + len -= sz; + } + /* Check remaining empty space - anything but 0xff there is an error */ + if (offs < c->leb_size) + if (!is_empty(buf, len)) + goto out_err; + *pbuf = sbuf; + return 0; + +out_err: + err = -EINVAL; +out_free: + vfree(sbuf); + *mst = NULL; + *cor = NULL; /* on failure neither a node nor corruption is reported */ + return err; +} + +/** + * write_rcvrd_mst_node - write recovered master node. + * @c: UBIFS file-system description object + * @mst: master node (its flags are restored before returning) + * + * This function returns %0 on success and a negative error code on failure. + */ +static int write_rcvrd_mst_node(struct ubifs_info *c, + struct ubifs_mst_node *mst) +{ + int err = 0, lnum = UBIFS_MST_LNUM, sz = c->mst_node_alsz; + __le32 save_flags; + + dbg_rcvry("recovery"); + + save_flags = mst->flags; + mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY); /* written with the recovery flag set */ + + ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1); + err = ubi_leb_change(c->ubi, lnum, mst, sz, UBI_SHORTTERM); + if (err) + goto out; + err = ubi_leb_change(c->ubi, lnum + 1, mst, sz, UBI_SHORTTERM); /* second master LEB */ + if (err) + goto out; +out: + mst->flags = save_flags; + return err; +} + +/** + * ubifs_recover_master_node - recover the master node. + * @c: UBIFS file-system description object + * + * This function recovers the master node from corruption that may occur due to + * an unclean unmount. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +int ubifs_recover_master_node(struct ubifs_info *c) +{ + void *buf1 = NULL, *buf2 = NULL, *cor1 = NULL, *cor2 = NULL; + struct ubifs_mst_node *mst1 = NULL, *mst2 = NULL, *mst; + const int sz = c->mst_node_alsz; + int err, offs1, offs2; + + dbg_rcvry("recovery"); + + err = get_master_node(c, UBIFS_MST_LNUM, &buf1, &mst1, &cor1); + if (err) + goto out_free; + + err = get_master_node(c, UBIFS_MST_LNUM + 1, &buf2, &mst2, &cor2); + if (err) + goto out_free; + + if (mst1) { + offs1 = (void *)mst1 - buf1; + if ((le32_to_cpu(mst1->flags) & UBIFS_MST_RCVRY) && + (offs1 == 0 && !cor1)) { + /* + * mst1 was written by recovery at offset 0 with no + * corruption. + */ + dbg_rcvry("recovery recovery"); + mst = mst1; + } else if (mst2) { + offs2 = (void *)mst2 - buf2; + if (offs1 == offs2) { + /* + * Same offset, so the two nodes must be the same; + * the common header is excluded from the comparison. + */ + if (memcmp((void *)mst1 + UBIFS_CH_SZ, + (void *)mst2 + UBIFS_CH_SZ, + UBIFS_MST_NODE_SZ - UBIFS_CH_SZ)) + goto out_err; + mst = mst1; + } else if (offs2 + sz == offs1) { + /* 1st LEB was written, 2nd was not */ + if (cor1) + goto out_err; + mst = mst1; + } else if (offs1 == 0 && offs2 + sz >= c->leb_size) { + /* 1st LEB was unmapped and written, 2nd not */ + if (cor1) + goto out_err; + mst = mst1; + } else + goto out_err; + } else { + /* + * 2nd LEB was unmapped and about to be written, so + * there must be only one master node in the first LEB + * and no corruption. + */ + if (offs1 != 0 || cor1) + goto out_err; + mst = mst1; + } + } else { + if (!mst2) /* neither LEB holds a valid master node */ + goto out_err; + /* + * 1st LEB was unmapped and about to be written, so there must + * be no room left in 2nd LEB. + */ + offs2 = (void *)mst2 - buf2; + if (offs2 + sz + sz <= c->leb_size) + goto out_err; + mst = mst2; + } + + dbg_rcvry("recovered master node from LEB %d", + (mst == mst1 ? UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); + + memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); + + if ((c->vfs_sb->s_flags & MS_RDONLY)) { + /* Read-only mode.
Keep a copy for switching to rw mode */ + c->rcvrd_mst_node = kmalloc(sz, GFP_KERNEL); + if (!c->rcvrd_mst_node) { + err = -ENOMEM; + goto out_free; + } + memcpy(c->rcvrd_mst_node, c->mst_node, UBIFS_MST_NODE_SZ); + } else { + /* Write the recovered master node */ + c->max_sqnum = le64_to_cpu(mst->ch.sqnum) - 1; + err = write_rcvrd_mst_node(c, c->mst_node); + if (err) + goto out_free; + } + + vfree(buf2); + vfree(buf1); + + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to recover master node"); + if (mst1) { + dbg_err("dumping first master node"); + dbg_dump_node(c, mst1); + } + if (mst2) { + dbg_err("dumping second master node"); + dbg_dump_node(c, mst2); + } + vfree(buf2); + vfree(buf1); + return err; +} + +/** + * ubifs_write_rcvrd_mst_node - write the recovered master node. + * @c: UBIFS file-system description object + * + * This function writes the master node that was recovered during mounting in + * read-only mode and must now be written because we are remounting rw. + * Nothing is done if no recovered master node was saved. The saved copy is + * freed once it has been written; it is kept if the write fails. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) +{ + int err; + + if (!c->rcvrd_mst_node) + return 0; + c->rcvrd_mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = write_rcvrd_mst_node(c, c->rcvrd_mst_node); + if (err) + return err; + kfree(c->rcvrd_mst_node); + c->rcvrd_mst_node = NULL; + return 0; +} + +/** + * is_last_write - determine if an offset was in the last write to a LEB. + * @c: UBIFS file-system description object + * @buf: buffer to check + * @offs: offset to check + * + * This function returns %1 if @offs was in the last write to the LEB whose data + * is in @buf, otherwise %0 is returned. The determination is made by checking + * for subsequent empty space starting from the next min_io_size boundary (or a + * bit less than the common header size if min_io_size is one).
+ */ +static int is_last_write(const struct ubifs_info *c, void *buf, int offs) +{ + int empty_offs; + int check_len; + uint8_t *p; + + if (c->min_io_size == 1) { /* scan backwards from the end of the LEB for the last non-0xff byte */ + check_len = c->leb_size - offs; + p = buf + check_len; + for (; check_len > 0; check_len--) + if (*--p != 0xff) + break; + /* + * 'check_len' is the size of the corruption which cannot be + * more than the size of 1 node if it was caused by an unclean + * unmount. + */ + if (check_len > UBIFS_MAX_NODE_SZ) + return 0; + return 1; + } + + /* + * Round up to the next c->min_io_size boundary i.e. 'offs' is in the + * last wbuf written. After that should be empty space. + */ + empty_offs = ALIGN(offs + 1, c->min_io_size); + check_len = c->leb_size - empty_offs; + p = buf + empty_offs - offs; + + for (; check_len > 0; check_len--) + if (*p++ != 0xff) + return 0; + return 1; +} + +/** + * clean_buf - clean the data from an LEB sitting in a buffer. + * @c: UBIFS file-system description object + * @buf: buffer to clean + * @lnum: LEB number to clean + * @offs: offset from which to clean + * @len: length of buffer + * + * This function pads up to the next min_io_size boundary (if there is one) and + * sets empty space to all 0xff. @buf, @offs and @len are updated to the next + * min_io_size boundary (if there is one). When min_io_size is 1 the rest of + * the LEB is simply set to 0xff and @buf, @offs and @len are left unchanged. + */ +static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, + int *offs, int *len) +{ + int empty_offs, pad_len; + + lnum = lnum; /* NOTE(review): presumably silences an unused-parameter warning when dbg_rcvry() compiles to nothing - confirm */ + dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); + + if (c->min_io_size == 1) { + memset(*buf, 0xff, c->leb_size - *offs); + return; + } + + ubifs_assert(!(*offs & 7)); /* the offset must be 8-byte aligned */ + empty_offs = ALIGN(*offs, c->min_io_size); + pad_len = empty_offs - *offs; + ubifs_pad(c, *buf, pad_len); + *offs += pad_len; + *buf += pad_len; + *len -= pad_len; + memset(*buf, 0xff, c->leb_size - empty_offs); +} + +/** + * no_more_nodes - determine if there are no more nodes in a buffer.
+ * @c: UBIFS file-system description object + * @buf: buffer to check + * @len: length of buffer + * @lnum: LEB number of the LEB from which @buf was read + * @offs: offset from which @buf was read + * + * This function ensures that the corrupted node at @offs is the last thing + * written to a LEB. This function returns %1 if more data is not found and + * %0 if more data is found. + */ +static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, + int lnum, int offs) +{ + struct ubifs_ch *ch = buf; + int skip, dlen = le32_to_cpu(ch->len); /* dlen: node length as recorded in the common header */ + + /* Check for empty space after the corrupt node's common header */ + skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs; + if (is_empty(buf + skip, len - skip)) + return 1; + /* + * The area after the common header size is not empty, so the common + * header must be intact. Check it. Any result other than -EUCLEAN from + * the check is treated as a bad common header. + */ + if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) { + dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs); + return 0; + } + /* Now we know the corrupt node's length we can skip over it */ + skip = ALIGN(offs + dlen, c->min_io_size) - offs; + /* After which there should be empty space */ + if (is_empty(buf + skip, len - skip)) + return 1; + dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip); + return 0; +} + +/** + * fix_unclean_leb - fix an unclean LEB.
+ * @c: UBIFS file-system description object + * @sleb: scanned LEB information + * @start: offset where scan started + */ +static int fix_unclean_leb(struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int start) +{ + int lnum = sleb->lnum, endpt = start; + + /* + * Get the end offset of the last node we are keeping; it stays at + * @start if no nodes were scanned. + */ + if (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + + snod = list_entry(sleb->nodes.prev, + struct ubifs_scan_node, list); + endpt = snod->offs + snod->len; + } + + if ((c->vfs_sb->s_flags & MS_RDONLY) && !c->remounting_rw) { + /* + * Read-only and not remounting rw: nothing is written now, the + * LEB is only added to the list of unclean LEBs. + */ + struct ubifs_unclean_leb *ucleb; + + dbg_rcvry("need to fix LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + ucleb = kzalloc(sizeof(struct ubifs_unclean_leb), GFP_NOFS); + if (!ucleb) + return -ENOMEM; + ucleb->lnum = lnum; + ucleb->endpt = endpt; + list_add_tail(&ucleb->list, &c->unclean_leb_list); + } else { + /* Write the fixed LEB back to flash */ + int err; + + dbg_rcvry("fixing LEB %d start %d endpt %d", + lnum, start, sleb->endpt); + if (endpt == 0) { /* nothing is kept: just unmap the LEB */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + } else { + int len = ALIGN(endpt, c->min_io_size); + + if (start) { /* re-read the part of the LEB that precedes @start */ + err = ubi_read(c->ubi, lnum, sleb->buf, 0, + start); + if (err) + return err; + } + /* Pad from the 8-byte aligned end of data up to min_io_size */ + if (len > endpt) { + int pad_len = len - ALIGN(endpt, 8); + + if (pad_len > 0) { + void *buf = sleb->buf + len - pad_len; + + ubifs_pad(c, buf, pad_len); + } + } + err = ubi_leb_change(c->ubi, lnum, sleb->buf, len, + UBI_UNKNOWN); + if (err) + return err; + } + } + return 0; +} + +/** + * drop_incomplete_group - drop nodes from an incomplete group. + * @sleb: scanned LEB information + * @offs: offset of dropped nodes is returned here + * + * This function returns %1 if nodes are dropped and %0 otherwise.
+ */ +static int drop_incomplete_group(struct ubifs_scan_leb *sleb, int *offs) +{ + int dropped = 0; + + while (!list_empty(&sleb->nodes)) { + struct ubifs_scan_node *snod; + struct ubifs_ch *ch; + + snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node, + list); /* last node on the list */ + ch = snod->node; + if (ch->group_type != UBIFS_IN_NODE_GROUP) /* stop at the first node not marked as in-group */ + return dropped; + dbg_rcvry("dropping node at %d:%d", sleb->lnum, snod->offs); + *offs = snod->offs; + list_del(&snod->list); + kfree(snod); + sleb->nodes_cnt -= 1; + dropped = 1; + } + return dropped; +} + +/** + * ubifs_recover_leb - scan and recover a LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * @grouped: nodes may be grouped for recovery + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns the scanned LEB information on success and an error + * pointer carrying a negative error code (see ERR_PTR()) on failure.
+ */ +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped) +{ + int err, len = c->leb_size - offs, need_clean = 0, quiet = 1; + int empty_chkd = 0, start = offs; + struct ubifs_scan_leb *sleb; + void *buf = sbuf + offs; + + dbg_rcvry("%d:%d", lnum, offs); + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + if (sleb->ecc) /* sleb->ecc set: the LEB is fixed up below via fix_unclean_leb() */ + need_clean = 1; + + while (len >= 8) { + int ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + /* + * Scan quietly until there is an error from which we cannot + * recover + */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + if (!is_empty(buf, len)) { + if (!is_last_write(c, buf, offs)) + break; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + empty_chkd = 1; + break; + } + + if (ret == SCANNED_GARBAGE || ret == SCANNED_A_BAD_PAD_NODE) + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (ret == SCANNED_A_CORRUPT_NODE) + if (no_more_nodes(c, buf, len, lnum, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + empty_chkd = 1; + break; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: +
dbg_err("unknown"); + goto corrupted; + } + } + + if (!empty_chkd && !is_empty(buf, len)) { + if (is_last_write(c, buf, offs)) { + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } else { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + } + + /* Drop trailing nodes of an incomplete group and clean from their offset */ + if (grouped && drop_incomplete_group(sleb, &offs)) { + buf = sbuf + offs; + len = c->leb_size - offs; + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + if (offs % c->min_io_size) { /* the end offset is not min_io_size aligned */ + clean_buf(c, &buf, lnum, &offs, &len); + need_clean = 1; + } + + ubifs_end_scan(c, sleb, lnum, offs); + + if (need_clean) { + err = fix_unclean_leb(c, sleb, start); + if (err) + goto error; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * get_cs_sqnum - get commit start sequence number. + * @c: UBIFS file-system description object + * @lnum: LEB number of commit start node + * @offs: offset of commit start node + * @cs_sqnum: commit start sequence number is returned here + * + * This function returns %0 on success and a negative error code on failure.
+ */ +static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, + unsigned long long *cs_sqnum) +{ + struct ubifs_cs_node *cs_node = NULL; + int err, ret; + + dbg_rcvry("at %d:%d", lnum, offs); + cs_node = kmalloc(UBIFS_CS_NODE_SZ, GFP_KERNEL); + if (!cs_node) + return -ENOMEM; + if (c->leb_size - offs < UBIFS_CS_NODE_SZ) + goto out_err; + err = ubi_read(c->ubi, lnum, (void *)cs_node, offs, UBIFS_CS_NODE_SZ); + if (err && err != -EBADMSG) + goto out_free; + ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); + if (ret != SCANNED_A_NODE) { + dbg_err("Not a valid node"); + goto out_err; + } + if (cs_node->ch.node_type != UBIFS_CS_NODE) { + dbg_err("Node a CS node, type is %d", cs_node->ch.node_type); + goto out_err; + } + if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { + dbg_err("CS node cmt_no %llu != current cmt_no %llu", + (unsigned long long)le64_to_cpu(cs_node->cmt_no), + c->cmt_no); + goto out_err; + } + *cs_sqnum = le64_to_cpu(cs_node->ch.sqnum); + dbg_rcvry("commit start sqnum %llu", *cs_sqnum); + kfree(cs_node); + return 0; + +out_err: + err = -EINVAL; +out_free: + ubifs_err("failed to get CS sqnum"); + kfree(cs_node); + return err; +} + +/** + * ubifs_recover_log_leb - scan and recover a log LEB. + * @c: UBIFS file-system description object + * @lnum: LEB number + * @offs: offset + * @sbuf: LEB-sized buffer to use + * + * This function does a scan of a LEB, but caters for errors that might have + * been caused by the unclean unmount from which we are attempting to recover. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int next_lnum; + + dbg_rcvry("LEB %d", lnum); + next_lnum = lnum + 1; + if (next_lnum >= UBIFS_LOG_LNUM + c->log_lebs) + next_lnum = UBIFS_LOG_LNUM; + if (next_lnum != c->ltail_lnum) { + /* + * We can only recover at the end of the log, so check that the + * next log LEB is empty or out of date. + */ + sleb = ubifs_scan(c, next_lnum, 0, sbuf); + if (IS_ERR(sleb)) + return sleb; + if (sleb->nodes_cnt) { + struct ubifs_scan_node *snod; + unsigned long long cs_sqnum = c->cs_sqnum; + + snod = list_entry(sleb->nodes.next, + struct ubifs_scan_node, list); + if (cs_sqnum == 0) { + int err; + + err = get_cs_sqnum(c, lnum, offs, &cs_sqnum); + if (err) { + ubifs_scan_destroy(sleb); + return ERR_PTR(err); + } + } + if (snod->sqnum > cs_sqnum) { + ubifs_err("unrecoverable log corruption " + "in LEB %d", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(-EUCLEAN); + } + } + ubifs_scan_destroy(sleb); + } + return ubifs_recover_leb(c, lnum, offs, sbuf, 0); +} + +/** + * recover_head - recover a head. + * @c: UBIFS file-system description object + * @lnum: LEB number of head to recover + * @offs: offset of head to recover + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at a head location. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int recover_head(const struct ubifs_info *c, int lnum, int offs, + void *sbuf) +{ + int len, err, need_clean = 0; + + if (c->min_io_size > 1) + len = c->min_io_size; + else + len = 512; /* min_io_size is 1: check 512 bytes instead */ + if (offs + len > c->leb_size) + len = c->leb_size - offs; + + if (!len) /* the head is at the very end of the LEB: nothing to check */ + return 0; + + /* Read at the head location and check it is empty flash */ + err = ubi_read(c->ubi, lnum, sbuf, offs, len); + if (err) /* a read error is treated like non-empty flash */ + need_clean = 1; + else { + uint8_t *p = sbuf; + + while (len--) + if (*p++ != 0xff) { + need_clean = 1; + break; + } + } + + if (need_clean) { + dbg_rcvry("cleaning head at %d:%d", lnum, offs); + if (offs == 0) + return ubifs_leb_unmap(c, lnum); + err = ubi_read(c->ubi, lnum, sbuf, 0, offs); /* keep only the data that precedes the head */ + if (err) + return err; + return ubi_leb_change(c->ubi, lnum, sbuf, offs, UBI_UNKNOWN); + } + + return 0; +} + +/** + * ubifs_recover_inl_heads - recover index and LPT heads. + * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function ensures that there is no data on the flash at the index and + * LPT head locations. + * + * This deals with the recovery of a half-completed journal commit. UBIFS is + * careful never to overwrite the last version of the index or the LPT. Because + * the index and LPT are wandering trees, data from a half-completed commit will + * not be referenced anywhere in UBIFS. The data will be either in LEBs that are + * assumed to be empty and will be unmapped anyway before use, or in the index + * and LPT heads. + * + * This function returns %0 on success and a negative error code on failure.
+ */ +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf) +{ + int err; + + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY) || c->remounting_rw); /* only valid when not read-only, or while remounting rw */ + + dbg_rcvry("checking index head at %d:%d", c->ihead_lnum, c->ihead_offs); + err = recover_head(c, c->ihead_lnum, c->ihead_offs, sbuf); + if (err) + return err; + + dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); + err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); + if (err) + return err; + + return 0; +} + +/** + * clean_an_unclean_leb - read and write a LEB to remove corruption. + * @c: UBIFS file-system description object + * @ucleb: unclean LEB information + * @sbuf: LEB-sized buffer to use + * + * This function reads a LEB up to a point pre-determined by the mount recovery, + * checks the nodes, and writes the result back to the flash, thereby cleaning + * off any following corruption, or non-fatal ECC errors. + * + * This function returns %0 on success and a negative error code on failure.
+ */ +static int clean_an_unclean_leb(const struct ubifs_info *c, + struct ubifs_unclean_leb *ucleb, void *sbuf) +{ + int err, lnum = ucleb->lnum, offs = 0, len = ucleb->endpt, quiet = 1; + void *buf = sbuf; + + dbg_rcvry("LEB %d len %d", lnum, len); + + if (len == 0) { + /* Nothing to read, just unmap it */ + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + return 0; + } + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err && err != -EBADMSG) + return err; + + while (len >= 8) { + int ret; + + cond_resched(); + + /* Scan quietly until there is an error */ + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, quiet); + + if (ret == SCANNED_A_NODE) { + /* A valid node, and not a padding node */ + struct ubifs_ch *ch = buf; + int node_len; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); + offs += node_len; + buf += node_len; + len -= node_len; + continue; + } + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) { + ubifs_err("unexpected empty space at %d:%d", + lnum, offs); + return -EUCLEAN; + } + + if (quiet) { + /* Redo the last scan but noisily */ + quiet = 0; + continue; + } + + ubifs_scanned_corruption(c, lnum, offs, buf); + return -EUCLEAN; + } + + /* Pad to min_io_size */ + len = ALIGN(ucleb->endpt, c->min_io_size); + if (len > ucleb->endpt) { + int pad_len = len - ALIGN(ucleb->endpt, 8); + + if (pad_len > 0) { + buf = c->sbuf + len - pad_len; + ubifs_pad(c, buf, pad_len); + } + } + + /* Write back the LEB atomically */ + err = ubi_leb_change(c->ubi, lnum, sbuf, len, UBI_UNKNOWN); + if (err) + return err; + + dbg_rcvry("cleaned LEB %d", lnum); + + return 0; +} + +/** + * ubifs_clean_lebs - clean LEBs recovered during read-only mount. 
+ * @c: UBIFS file-system description object + * @sbuf: LEB-sized buffer to use + * + * This function cleans a LEB identified during recovery that needs to be + * written but was not because UBIFS was mounted read-only. This happens when + * remounting to read-write mode. + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf) +{ + dbg_rcvry("recovery"); + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + int err; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + err = clean_an_unclean_leb(c, ucleb, sbuf); + if (err) /* the entry that failed stays on the list */ + return err; + list_del(&ucleb->list); + kfree(ucleb); + } + return 0; +} + +/** + * ubifs_rcvry_gc_commit - recover the GC LEB number and run the commit. + * @c: UBIFS file-system description object + * + * Out-of-place garbage collection requires always one empty LEB with which to + * start garbage collection. The LEB number is recorded in c->gc_lnum and is + * written to the master node on unmounting. In the case of an unclean unmount + * the value of gc_lnum recorded in the master node is out of date and cannot + * be used. Instead, recovery must allocate an empty LEB for this purpose. + * However, there may not be enough empty space, in which case it must be + * possible to GC the dirtiest LEB into the GC head LEB. + * + * This function also runs the commit which causes the TNC updates from + * size-recovery and orphans to be written to the flash. That is important to + * ensure correct replay order for subsequent mounts. + * + * This function returns %0 on success and a negative error code on failure.
+ */ +int ubifs_rcvry_gc_commit(struct ubifs_info *c) +{ + struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf; + struct ubifs_lprops lp; + int lnum, err; + + c->gc_lnum = -1; /* no GC LEB until one is allocated below */ + if (wbuf->lnum == -1) { + dbg_rcvry("no GC head LEB"); + goto find_free; + } + /* + * See whether the used space in the dirtiest LEB fits in the GC head + * LEB. + */ + if (wbuf->offs == c->leb_size) { + dbg_rcvry("no room in GC head LEB"); + goto find_free; + } + err = ubifs_find_dirty_leb(c, &lp, wbuf->offs, 2); + if (err) { + if (err == -ENOSPC) + dbg_err("could not find a dirty LEB"); + return err; + } + ubifs_assert(!(lp.flags & LPROPS_INDEX)); + lnum = lp.lnum; + if (lp.free + lp.dirty == c->leb_size) { + /* An empty LEB was returned (free plus dirty covers the whole LEB) */ + if (lp.free != c->leb_size) { + err = ubifs_change_one_lp(c, lnum, c->leb_size, + 0, 0, 0, 0); + if (err) + return err; + } + err = ubifs_leb_unmap(c, lnum); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); + } + /* + * There was no empty LEB so the used space in the dirtiest LEB must fit + * in the GC head LEB. + */ + if (lp.free + lp.dirty < wbuf->offs) { + dbg_rcvry("LEB %d doesn't fit in GC head LEB %d:%d", + lnum, wbuf->lnum, wbuf->offs); + err = ubifs_return_leb(c, lnum); + if (err) + return err; + goto find_free; + } + /* + * We run the commit before garbage collection otherwise subsequent + * mounts will see the GC and orphan deletion in a different order. + */ + dbg_rcvry("committing"); + err = ubifs_run_commit(c); + if (err) + return err; + /* + * The data in the dirtiest LEB fits in the GC head LEB, so do the GC + * - use locking to keep 'ubifs_assert()' happy.
+ */ + dbg_rcvry("GC'ing LEB %d", lnum); + mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); + err = ubifs_garbage_collect_leb(c, &lp); + if (err >= 0) { + int err2 = ubifs_wbuf_sync_nolock(wbuf); + + if (err2) /* a failed write-buffer sync overrides the GC result */ + err = err2; + } + mutex_unlock(&wbuf->io_mutex); + if (err < 0) { + dbg_err("GC failed, error %d", err); + if (err == -EAGAIN) /* -EAGAIN is reported to the caller as -EINVAL */ + err = -EINVAL; + return err; + } + if (err != LEB_RETAINED) { /* any other non-negative result is treated as an error */ + dbg_err("GC returned %d", err); + return -EINVAL; + } + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + return err; + dbg_rcvry("allocated LEB %d for GC", lnum); /* NOTE(review): prints lnum although c->gc_lnum was unmapped - presumably equal after LEB_RETAINED, confirm */ + return 0; + +find_free: + /* + * There is no GC head LEB or the free space in the GC head LEB is too + * small. Allocate gc_lnum by calling 'ubifs_find_free_leb_for_idx()' so + * GC is not run. + */ + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) { + dbg_err("could not find an empty LEB"); + return lnum; + } + /* And reset the index flag */ + err = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX, 0); + if (err) + return err; + c->gc_lnum = lnum; + dbg_rcvry("allocated LEB %d for GC", lnum); + /* Run the commit */ + dbg_rcvry("committing"); + return ubifs_run_commit(c); +} + +/** + * struct size_entry - inode size information for recovery. + * @rb: link in the RB-tree of sizes + * @inum: inode number (the key the tree is ordered by) + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + * @inode: inode if pinned in memory awaiting rw mode to fix it + */ +struct size_entry { + struct rb_node rb; + ino_t inum; + loff_t i_size; + loff_t d_size; + int exists; + struct inode *inode; +}; + +/** + * add_ino - add an entry to the size tree.
+ * @c: UBIFS file-system description object + * @inum: inode number + * @i_size: size on inode + * @d_size: maximum size based on data nodes + * @exists: indicates whether the inode exists + */ +static int add_ino(struct ubifs_info *c, ino_t inum, loff_t i_size, + loff_t d_size, int exists) +{ + struct rb_node **p = &c->size_tree.rb_node, *parent = NULL; + struct size_entry *e; + + while (*p) { /* find the insertion point; an existing equal @inum is not detected and goes right */ + parent = *p; + e = rb_entry(parent, struct size_entry, rb); + if (inum < e->inum) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + e = kzalloc(sizeof(struct size_entry), GFP_KERNEL); /* zeroed, so e->inode starts as NULL */ + if (!e) + return -ENOMEM; + + e->inum = inum; + e->i_size = i_size; + e->d_size = d_size; + e->exists = exists; + + rb_link_node(&e->rb, parent, p); + rb_insert_color(&e->rb, &c->size_tree); + + return 0; +} + +/** + * find_ino - find an entry on the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + * + * This function returns the entry for @inum, or %NULL if there is none. + */ +static struct size_entry *find_ino(struct ubifs_info *c, ino_t inum) +{ + struct rb_node *p = c->size_tree.rb_node; + struct size_entry *e; + + while (p) { + e = rb_entry(p, struct size_entry, rb); + if (inum < e->inum) + p = p->rb_left; + else if (inum > e->inum) + p = p->rb_right; + else + return e; + } + return NULL; +} + +/** + * remove_ino - remove an entry from the size tree. + * @c: UBIFS file-system description object + * @inum: inode number + * + * Nothing is done if there is no entry for @inum. + */ +static void remove_ino(struct ubifs_info *c, ino_t inum) +{ + struct size_entry *e = find_ino(c, inum); + + if (!e) + return; + rb_erase(&e->rb, &c->size_tree); + kfree(e); +} + +/** + * ubifs_destroy_size_tree - free resources related to the size tree.
+ * @c: UBIFS file-system description object + */ +void ubifs_destroy_size_tree(struct ubifs_info *c) +{ + struct rb_node *this = c->size_tree.rb_node; + struct size_entry *e; + + while (this) { /* descend to a leaf before freeing anything */ + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + e = rb_entry(this, struct size_entry, rb); + if (e->inode) /* release the inode reference held by the entry */ + iput(e->inode); + this = rb_parent(this); /* step up and unlink the leaf from its parent before freeing it */ + if (this) { + if (this->rb_left == &e->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(e); + } + c->size_tree = RB_ROOT; +} + +/** + * ubifs_recover_size_accum - accumulate inode sizes for recovery. + * @c: UBIFS file-system description object + * @key: node key + * @deletion: node is for a deletion + * @new_size: inode size + * + * This function has two purposes: + * 1) to ensure there are no data nodes that fall outside the inode size + * 2) to ensure there are no data nodes for inodes that do not exist + * To accomplish those purposes, a rb-tree is constructed containing an entry + * for each inode number in the journal that has not been deleted, and recording + * the size from the inode node, the maximum size of any data node (also altered + * by truncations) and a flag indicating a inode number for which no inode node + * was present in the journal. + * + * Note that there is still the possibility that there are data nodes that have + * been committed that are beyond the inode size, however the only way to find + * them would be to scan the entire index. Alternatively, some provision could + * be made to record the size of inodes at the start of commit, which would seem + * very cumbersome for a scenario that is quite unlikely and the only negative + * consequence of which is wasted space. + * + * This functions returns %0 on success and a negative error code on failure.
+ */ +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size) +{ + ino_t inum = key_inum(c, key); + struct size_entry *e; + int err; + + switch (key_type(c, key)) { /* other key types are ignored */ + case UBIFS_INO_KEY: /* inode node: record its size, or forget a deleted inode */ + if (deletion) + remove_ino(c, inum); + else { + e = find_ino(c, inum); + if (e) { + e->i_size = new_size; + e->exists = 1; + } else { + err = add_ino(c, inum, new_size, 0, 1); + if (err) + return err; + } + } + break; + case UBIFS_DATA_KEY: /* data node: track the largest size seen */ + e = find_ino(c, inum); + if (e) { + if (new_size > e->d_size) + e->d_size = new_size; + } else { + err = add_ino(c, inum, 0, new_size, 0); + if (err) + return err; + } + break; + case UBIFS_TRUN_KEY: /* truncation: replaces the accumulated data size, if there is an entry */ + e = find_ino(c, inum); + if (e) + e->d_size = new_size; + break; + } + return 0; +} + +/** + * fix_size_in_place - fix inode size in place on flash. + * @c: UBIFS file-system description object + * @e: inode size information for recovery + * + * The LEB holding the inode node is read into @c->sbuf, the size field and the + * CRC of the node are updated, and the LEB is written back. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) +{ + struct ubifs_ino_node *ino = c->sbuf; + unsigned char *p; + union ubifs_key key; + int err, lnum, offs, len; + loff_t i_size; + uint32_t crc; + + /* Locate the inode node LEB number and offset */ + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_locate(c, &key, ino, &lnum, &offs); + if (err) + goto out; + /* + * If the size recorded on the inode node is greater than the size that + * was calculated from nodes in the journal then don't change the inode.
+ */ + i_size = le64_to_cpu(ino->size); + if (i_size >= e->d_size) + return 0; + /* Read the LEB */ + err = ubi_read(c->ubi, lnum, c->sbuf, 0, c->leb_size); + if (err) + goto out; + /* + * Change the size field and recalculate the CRC, which is computed over + * the node minus its first 8 bytes. + */ + ino = c->sbuf + offs; + ino->size = cpu_to_le64(e->d_size); + len = le32_to_cpu(ino->ch.len); + crc = crc32(UBIFS_CRC32_INIT, (void *)ino + 8, len - 8); + ino->ch.crc = cpu_to_le32(crc); + /* Work out where data in the LEB ends and free space begins */ + p = c->sbuf; + len = c->leb_size - 1; + while (p[len] == 0xff) /* NOTE(review): no lower bound on len; relies on the LEB holding some non-0xff byte - confirm */ + len -= 1; + len = ALIGN(len + 1, c->min_io_size); + /* Atomically write the fixed LEB back again */ + err = ubi_leb_change(c->ubi, lnum, c->sbuf, len, UBI_UNKNOWN); + if (err) + goto out; + dbg_rcvry("inode %lu at %d:%d size %lld -> %lld ", + (unsigned long)e->inum, lnum, offs, i_size, e->d_size); + return 0; + +out: + ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + (unsigned long)e->inum, e->i_size, e->d_size, err); + return err; +} + +/** + * ubifs_recover_size - recover inode size. + * @c: UBIFS file-system description object + * + * This function attempts to fix inode size discrepancies identified by the + * 'ubifs_recover_size_accum()' function. + * + * This functions returns %0 on success and a negative error code on failure.
+ */ +int ubifs_recover_size(struct ubifs_info *c) +{ + struct rb_node *this = rb_first(&c->size_tree); + + while (this) { + struct size_entry *e; + int err; + + e = rb_entry(this, struct size_entry, rb); + if (!e->exists) { + union ubifs_key key; + + ino_key_init(c, &key, e->inum); + err = ubifs_tnc_lookup(c, &key, c->sbuf); + if (err && err != -ENOENT) + return err; + if (err == -ENOENT) { + /* Remove data nodes that have no inode */ + dbg_rcvry("removing ino %lu", + (unsigned long)e->inum); + err = ubifs_tnc_remove_ino(c, e->inum); + if (err) + return err; + } else { + struct ubifs_ino_node *ino = c->sbuf; + + e->exists = 1; + e->i_size = le64_to_cpu(ino->size); + } + } + if (e->exists && e->i_size < e->d_size) { + if (!e->inode && (c->vfs_sb->s_flags & MS_RDONLY)) { + /* Fix the inode size and pin it in memory */ + struct inode *inode; + + inode = ubifs_iget(c->vfs_sb, e->inum); + if (IS_ERR(inode)) + return PTR_ERR(inode); + if (inode->i_size < e->d_size) { + dbg_rcvry("ino %lu size %lld -> %lld", + (unsigned long)e->inum, + e->d_size, inode->i_size); + inode->i_size = e->d_size; + ubifs_inode(inode)->ui_size = e->d_size; + e->inode = inode; + this = rb_next(this); + continue; + } + iput(inode); + } else { + /* Fix the size in place */ + err = fix_size_in_place(c, e); + if (err) + return err; + if (e->inode) + iput(e->inode); + } + } + this = rb_next(this); + rb_erase(&e->rb, &c->size_tree); + kfree(e); + } + return 0; +} diff -Nurd linux-2.6.24/fs/ubifs/replay.c ubifs-v2.6.24/fs/ubifs/replay.c --- linux-2.6.24/fs/ubifs/replay.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/replay.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,1084 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. 
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/*
+ * This file contains journal replay code. It runs when the file-system is being
+ * mounted and requires no locking.
+ *
+ * The larger the journal is, the longer it takes to scan it, so the longer it
+ * takes to mount UBIFS. This is why the journal has limited size which may be
+ * changed depending on the system requirements. But a larger journal gives
+ * faster I/O speed because it writes the index less frequently. So this is a
+ * trade-off. Also, the journal is indexed by the in-memory index (TNC), so the
+ * larger the journal is, the more memory its index may consume.
+ */
+
+#include "ubifs.h"
+
+/*
+ * Replay flags (bits stored in the 'flags' field of a replay entry).
+ *
+ * REPLAY_DELETION: node was deleted
+ * REPLAY_REF: node is a reference node
+ */
+enum {
+	REPLAY_DELETION = 1,
+	REPLAY_REF = 2,
+};
+
+/**
+ * struct replay_entry - replay tree entry.
+ * @lnum: logical eraseblock number of the node
+ * @offs: node offset
+ * @len: node length
+ * @sqnum: node sequence number
+ * @flags: replay flags
+ * @rb: links the replay tree
+ * @key: node key
+ * @nm: directory entry name
+ * @old_size: truncation old size
+ * @new_size: new size (truncation new size, inode size or data node end)
+ * @free: amount of free space in a bud
+ * @dirty: amount of dirty space in a bud from padding and deletion nodes
+ *
+ * UBIFS journal replay must compare node sequence numbers, which means it must
+ * build a tree of node information to insert into the TNC.
+ */
+struct replay_entry {
+	int lnum;
+	int offs;
+	int len;
+	unsigned long long sqnum;
+	int flags;
+	struct rb_node rb;
+	union ubifs_key key;
+	union {
+		struct qstr nm;
+		struct {
+			loff_t old_size;
+			loff_t new_size;
+		};
+		struct {
+			int free;
+			int dirty;
+		};
+	};
+};
+
+/**
+ * struct bud_entry - entry in the list of buds to replay.
+ * @list: next bud in the list
+ * @bud: bud description object
+ * @free: free bytes in the bud (never set or read by the code in this file)
+ * @sqnum: reference node sequence number
+ */
+struct bud_entry {
+	struct list_head list;
+	struct ubifs_bud *bud;
+	int free;
+	unsigned long long sqnum;
+};
+
+/**
+ * set_bud_lprops - set free and dirty space used by a bud.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of bud
+ */
+static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
+{
+	const struct ubifs_lprops *lp;
+	int err = 0, dirty;
+
+	ubifs_get_lprops(c);
+
+	lp = ubifs_lpt_lookup_dirty(c, r->lnum);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+
+	dirty = lp->dirty;
+	if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
+		/*
+		 * The LEB was added to the journal with a starting offset of
+		 * zero which means the LEB must have been empty. The LEB
+		 * property values should be lp->free == c->leb_size and
+		 * lp->dirty == 0, but that is not the case. The reason is that
+		 * the LEB was garbage collected. The garbage collector resets
+		 * the free and dirty space without recording it anywhere except
+		 * lprops, so if there is not a commit then lprops does not have
+		 * that information next time the file system is mounted.
+		 *
+		 * We do not need to adjust free space because the scan has told
+		 * us the exact value which is recorded in the replay entry as
+		 * r->free.
+		 *
+		 * However we do need to subtract from the dirty space the
+		 * amount of space that the garbage collector reclaimed, which
+		 * is the whole LEB minus the amount of space that was free.
+		 */
+		dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+			lp->free, lp->dirty);
+		dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
+			lp->free, lp->dirty);
+		dirty -= c->leb_size - lp->free;
+		/*
+		 * If the replay order was perfect the dirty space would now be
+		 * zero. The order is not perfect because the journal heads
+		 * race with each other. This is not a problem but it does mean
+		 * that the dirty space may temporarily exceed c->leb_size
+		 * during the replay.
+		 */
+		if (dirty != 0)
+			dbg_msg("LEB %d lp: %d free %d dirty "
+				"replay: %d free %d dirty", r->lnum, lp->free,
+				lp->dirty, r->free, r->dirty);
+	}
+	lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
+			     lp->flags | LPROPS_TAKEN, 0);
+	if (IS_ERR(lp)) {
+		err = PTR_ERR(lp);
+		goto out;
+	}
+out:
+	ubifs_release_lprops(c);
+	return err;
+}
+
+/**
+ * trun_remove_range - apply a replay entry for a truncation to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry of truncation
+ */
+static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
+{
+	unsigned min_blk, max_blk;
+	union ubifs_key min_key, max_key;
+	ino_t ino;
+
+	min_blk = r->new_size / UBIFS_BLOCK_SIZE;
+	if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
+		min_blk += 1;
+
+	max_blk = r->old_size / UBIFS_BLOCK_SIZE;
+	if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
+		max_blk -= 1;
+
+	ino = key_inum(c, &r->key);
+
+	data_key_init(c, &min_key, ino, min_blk);
+	data_key_init(c, &max_key, ino, max_blk);
+
+	return ubifs_tnc_remove_range(c, &min_key, &max_key);
+}
+
+/**
+ * apply_replay_entry - apply a replay entry to the TNC.
+ * @c: UBIFS file-system description object
+ * @r: replay entry to apply
+ *
+ * Apply a replay entry to the TNC. Returns zero on success or an error code.
+ */
+static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
+{
+	int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
+
+	dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
+		r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
+
+	/* Set c->replay_sqnum to help deal with dangling branches. */
+	c->replay_sqnum = r->sqnum;
+
+	if (r->flags & REPLAY_REF)
+		err = set_bud_lprops(c, r);
+	else if (is_hash_key(c, &r->key)) {
+		if (deletion)
+			err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
+		else
+			err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
+					       r->len, &r->nm);
+	} else {
+		if (deletion)
+			switch (key_type(c, &r->key)) {
+			case UBIFS_INO_KEY:
+			{
+				ino_t inum = key_inum(c, &r->key);
+
+				err = ubifs_tnc_remove_ino(c, inum);
+				break;
+			}
+			case UBIFS_TRUN_KEY:
+				err = trun_remove_range(c, r);
+				break;
+			default:
+				err = ubifs_tnc_remove(c, &r->key);
+				break;
+			}
+		else
+			err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
+					    r->len);
+		if (err)
+			return err;
+
+		if (c->need_recovery)
+			err = ubifs_recover_size_accum(c, &r->key, deletion,
+						       r->new_size);
+	}
+
+	return err;
+}
+
+/**
+ * destroy_replay_tree - destroy the replay tree.
+ * @c: UBIFS file-system description object
+ *
+ * Free every replay entry (and the name copy of hashed-key entries).
+ */
+static void destroy_replay_tree(struct ubifs_info *c)
+{
+	struct rb_node *this = c->replay_tree.rb_node;
+	struct replay_entry *r;
+
+	while (this) {
+		if (this->rb_left) {
+			this = this->rb_left;
+			continue;
+		} else if (this->rb_right) {
+			this = this->rb_right;
+			continue;
+		}
+		r = rb_entry(this, struct replay_entry, rb);
+		this = rb_parent(this);
+		if (this) {
+			if (this->rb_left == &r->rb)
+				this->rb_left = NULL;
+			else
+				this->rb_right = NULL;
+		}
+		if (is_hash_key(c, &r->key))
+			kfree(r->nm.name);
+		kfree(r);
+	}
+	c->replay_tree = RB_ROOT;
+}
+
+/**
+ * apply_replay_tree - apply the replay tree to the TNC.
+ * @c: UBIFS file-system description object
+ *
+ * Apply every entry of the replay tree, walking it from its first node.
+ * Returns zero in case of success and a negative error code in case of
+ * failure.
+ */
+static int apply_replay_tree(struct ubifs_info *c)
+{
+	struct rb_node *this = rb_first(&c->replay_tree);
+
+	while (this) {
+		struct replay_entry *r;
+		int err;
+
+		cond_resched();
+
+		r = rb_entry(this, struct replay_entry, rb);
+		err = apply_replay_entry(c, r);
+		if (err)
+			return err;
+		this = rb_next(this);
+	}
+	return 0;
+}
+
+/**
+ * insert_node - insert a node to the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: running count of bytes in use in the LEB (increased for non-deletions)
+ * @old_size: truncation old size
+ * @new_size: new size (truncation new size, inode size or data node end)
+ *
+ * This function inserts a scanned non-direntry node to the replay tree. The
+ * replay tree is an RB-tree containing &struct replay_entry elements which are
+ * indexed by the sequence number. The replay tree is applied at the very end
+ * of the replay process. Since the tree is sorted in sequence number order,
+ * the older modifications are applied first. This function returns zero in
+ * case of success and a negative error code in case of failure.
+ */
+static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
+		       union ubifs_key *key, unsigned long long sqnum,
+		       int deletion, int *used, loff_t old_size,
+		       loff_t new_size)
+{
+	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+	struct replay_entry *r;
+
+	if (key_inum(c, key) >= c->highest_inum)
+		c->highest_inum = key_inum(c, key);
+
+	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct replay_entry, rb);
+		if (sqnum < r->sqnum) {
+			p = &(*p)->rb_left;
+			continue;
+		} else if (sqnum > r->sqnum) {
+			p = &(*p)->rb_right;
+			continue;
+		}
+		ubifs_err("duplicate sqnum in replay");
+		return -EINVAL;
+	}
+
+	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	if (!deletion)
+		*used += ALIGN(len, 8);
+	r->lnum = lnum;
+	r->offs = offs;
+	r->len = len;
+	r->sqnum = sqnum;
+	r->flags = (deletion ? REPLAY_DELETION : 0);
+	r->old_size = old_size;
+	r->new_size = new_size;
+	key_copy(c, key, &r->key);
+
+	rb_link_node(&r->rb, parent, p);
+	rb_insert_color(&r->rb, &c->replay_tree);
+	return 0;
+}
+
+/**
+ * insert_dent - insert a directory entry node into the replay tree.
+ * @c: UBIFS file-system description object
+ * @lnum: node logical eraseblock number
+ * @offs: node offset
+ * @len: node length
+ * @key: node key
+ * @name: directory entry name
+ * @nlen: directory entry name length
+ * @sqnum: sequence number
+ * @deletion: non-zero if this is a deletion
+ * @used: running count of bytes in use in the LEB (increased for non-deletions)
+ *
+ * This function inserts a scanned directory entry node into the replay tree,
+ * keeping its own NUL-terminated copy of @name. Returns zero on success,
+ * %-EINVAL for a duplicate @sqnum and %-ENOMEM if memory allocation fails.
+ *
+ * This function is also used for extended attribute entries because they are
+ * implemented as directory entry nodes.
+ */
+static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
+		       union ubifs_key *key, const char *name, int nlen,
+		       unsigned long long sqnum, int deletion, int *used)
+{
+	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+	struct replay_entry *r;
+	char *nbuf;
+
+	if (key_inum(c, key) >= c->highest_inum)
+		c->highest_inum = key_inum(c, key);
+
+	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct replay_entry, rb);
+		if (sqnum < r->sqnum) {
+			p = &(*p)->rb_left;
+			continue;
+		}
+		if (sqnum > r->sqnum) {
+			p = &(*p)->rb_right;
+			continue;
+		}
+		ubifs_err("duplicate sqnum in replay");
+		return -EINVAL;
+	}
+
+	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+	nbuf = kmalloc(nlen + 1, GFP_KERNEL);
+	if (!nbuf) {
+		kfree(r);
+		return -ENOMEM;
+	}
+
+	if (!deletion)
+		*used += ALIGN(len, 8);
+	r->lnum = lnum;
+	r->offs = offs;
+	r->len = len;
+	r->sqnum = sqnum;
+	r->nm.len = nlen;
+	memcpy(nbuf, name, nlen);
+	nbuf[nlen] = '\0';
+	r->nm.name = nbuf;
+	r->flags = (deletion ? REPLAY_DELETION : 0);
+	key_copy(c, key, &r->key);
+
+	ubifs_assert(!*p);
+	rb_link_node(&r->rb, parent, p);
+	rb_insert_color(&r->rb, &c->replay_tree);
+	return 0;
+}
+
+/**
+ * ubifs_validate_entry - validate directory or extended attribute entry node.
+ * @c: UBIFS file-system description object
+ * @dent: the node to validate
+ *
+ * This function validates directory or extended attribute entry node @dent.
+ * Returns zero if the node is all right and %-EINVAL if it is not.
+ */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent) +{ + int key_type = key_type_flash(c, dent->key); + int nlen = le16_to_cpu(dent->nlen); + + if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || + dent->type >= UBIFS_ITYPES_CNT || + nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || + strnlen(dent->name, nlen) != nlen || + le64_to_cpu(dent->inum) > MAX_INUM) { + ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + "directory entry" : "extended attribute entry"); + return -EINVAL; + } + + if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { + ubifs_err("bad key type %d", key_type); + return -EINVAL; + } + + return 0; +} + +/** + * replay_bud - replay a bud logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @free: amount of free space in the bud is returned here + * @dirty: amount of dirty space from padding and deletion nodes is returned + * here + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + int *free, int *dirty) +{ + int err = 0, used = 0; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + struct ubifs_bud *bud; + + dbg_mnt("replay bud LEB %d, head %d", lnum, jhead); + if (c->need_recovery) + sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD); + else + sleb = ubifs_scan(c, lnum, offs, c->sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + + /* + * The bud does not have to start from offset zero - the beginning of + * the 'lnum' LEB may contain previously committed data. One of the + * things we have to do in replay is to correctly update lprops with + * newer information about this LEB. 
+ * + * At this point lprops thinks that this LEB has 'c->leb_size - offs' + * bytes of free space because it only contain information about + * committed data. + * + * But we know that real amount of free space is 'c->leb_size - + * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and + * 'sleb->endpt' is used by bud data. We have to correctly calculate + * how much of these data are dirty and update lprops with this + * information. + * + * The dirt in that LEB region is comprised of padding nodes, deletion + * nodes, truncation nodes and nodes which are obsoleted by subsequent + * nodes in this LEB. So instead of calculating clean space, we + * calculate used space ('used' variable). + */ + + list_for_each_entry(snod, &sleb->nodes, list) { + int deletion = 0; + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_INO_NODE: + { + struct ubifs_ino_node *ino = snod->node; + loff_t new_size = le64_to_cpu(ino->size); + + if (le32_to_cpu(ino->nlink) == 0) + deletion = 1; + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DATA_NODE: + { + struct ubifs_data_node *dn = snod->node; + loff_t new_size = le32_to_cpu(dn->size) + + key_block(c, &snod->key) * + UBIFS_BLOCK_SIZE; + + err = insert_node(c, lnum, snod->offs, snod->len, + &snod->key, snod->sqnum, deletion, + &used, 0, new_size); + break; + } + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + { + struct ubifs_dent_node *dent = snod->node; + + err = ubifs_validate_entry(c, dent); + if (err) + goto out_dump; + + err = insert_dent(c, lnum, snod->offs, snod->len, + &snod->key, dent->name, + le16_to_cpu(dent->nlen), snod->sqnum, + !le64_to_cpu(dent->inum), &used); + break; + } + case UBIFS_TRUN_NODE: + { + struct ubifs_trun_node *trun = snod->node; + 
loff_t old_size = le64_to_cpu(trun->old_size); + loff_t new_size = le64_to_cpu(trun->new_size); + union ubifs_key key; + + /* Validate truncation node */ + if (old_size < 0 || old_size > c->max_inode_sz || + new_size < 0 || new_size > c->max_inode_sz || + old_size <= new_size) { + ubifs_err("bad truncation node"); + goto out_dump; + } + + /* + * Create a fake truncation key just to use the same + * functions which expect nodes to have keys. + */ + trun_key_init(c, &key, le32_to_cpu(trun->inum)); + err = insert_node(c, lnum, snod->offs, snod->len, + &key, snod->sqnum, 1, &used, + old_size, new_size); + break; + } + default: + ubifs_err("unexpected node type %d in bud LEB %d:%d", + snod->type, lnum, snod->offs); + err = -EINVAL; + goto out_dump; + } + if (err) + goto out; + } + + bud = ubifs_search_bud(c, lnum); + if (!bud) + BUG(); + + ubifs_assert(sleb->endpt - offs >= used); + ubifs_assert(sleb->endpt % c->min_io_size == 0); + + if (sleb->endpt + c->min_io_size <= c->leb_size && + !(c->vfs_sb->s_flags & MS_RDONLY)) + err = ubifs_wbuf_seek_nolock(&c->jheads[jhead].wbuf, lnum, + sleb->endpt, UBI_SHORTTERM); + + *dirty = sleb->endpt - offs - used; + *free = c->leb_size - sleb->endpt; + +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * insert_ref_node - insert a reference node to the replay tree. + * @c: UBIFS file-system description object + * @lnum: node logical eraseblock number + * @offs: node offset + * @sqnum: sequence number + * @free: amount of free space in bud + * @dirty: amount of dirty space from padding and deletion nodes + * + * This function inserts a reference node to the replay tree and returns zero + * in case of success or a negative error code in case of failure. 
+ */
+static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
+			   unsigned long long sqnum, int free, int dirty)
+{
+	struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
+	struct replay_entry *r;
+
+	dbg_mnt("add ref LEB %d:%d", lnum, offs);
+	while (*p) {
+		parent = *p;
+		r = rb_entry(parent, struct replay_entry, rb);
+		if (sqnum < r->sqnum) {
+			p = &(*p)->rb_left;
+			continue;
+		} else if (sqnum > r->sqnum) {
+			p = &(*p)->rb_right;
+			continue;
+		}
+		ubifs_err("duplicate sqnum in replay tree");
+		return -EINVAL;
+	}
+
+	r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
+	if (!r)
+		return -ENOMEM;
+
+	r->lnum = lnum;
+	r->offs = offs;
+	r->sqnum = sqnum;
+	r->flags = REPLAY_REF;
+	r->free = free;
+	r->dirty = dirty;
+
+	rb_link_node(&r->rb, parent, p);
+	rb_insert_color(&r->rb, &c->replay_tree);
+	return 0;
+}
+
+/**
+ * replay_buds - replay all buds.
+ * @c: UBIFS file-system description object
+ *
+ * Each bud is replayed and then recorded in the replay tree by a reference
+ * entry. Returns zero in case of success and a negative error code otherwise.
+ */
+static int replay_buds(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+	int err, uninitialized_var(free), uninitialized_var(dirty);
+
+	list_for_each_entry(b, &c->replay_buds, list) {
+		err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
+				 &free, &dirty);
+		if (err)
+			return err;
+		err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
+				      free, dirty);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/**
+ * destroy_bud_list - free the entries of the list of buds to replay.
+ * @c: UBIFS file-system description object
+ */
+static void destroy_bud_list(struct ubifs_info *c)
+{
+	struct bud_entry *b;
+
+	while (!list_empty(&c->replay_buds)) {
+		b = list_entry(c->replay_buds.next, struct bud_entry, list);
+		list_del(&b->list);
+		kfree(b);
+	}
+}
+
+/**
+ * add_replay_bud - add a bud to the list of buds to replay.
+ * @c: UBIFS file-system description object + * @lnum: bud logical eraseblock number to replay + * @offs: bud start offset + * @jhead: journal head to which this bud belongs + * @sqnum: reference node sequence number + * + * This function returns zero in case of success and a negative error code in + * case of failure. + */ +static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead, + unsigned long long sqnum) +{ + struct ubifs_bud *bud; + struct bud_entry *b; + + dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead); + + bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL); + if (!bud) + return -ENOMEM; + + b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL); + if (!b) { + kfree(bud); + return -ENOMEM; + } + + bud->lnum = lnum; + bud->start = offs; + bud->jhead = jhead; + ubifs_add_bud(c, bud); + + b->bud = bud; + b->sqnum = sqnum; + list_add_tail(&b->list, &c->replay_buds); + + return 0; +} + +/** + * validate_ref - validate a reference node. + * @c: UBIFS file-system description object + * @ref: the reference node to validate + * @ref_lnum: LEB number of the reference node + * @ref_offs: reference node offset + * + * This function returns %1 if a bud reference already exists for the LEB. %0 is + * returned if the reference node is new, otherwise %-EINVAL is returned if + * validation failed. + */ +static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) +{ + struct ubifs_bud *bud; + int lnum = le32_to_cpu(ref->lnum); + unsigned int offs = le32_to_cpu(ref->offs); + unsigned int jhead = le32_to_cpu(ref->jhead); + + /* + * ref->offs may point to the end of LEB when the journal head points + * to the end of LEB and we write reference node for it during commit. + * So this is why we require 'offs > c->leb_size'. 
+ */ + if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt || + lnum < c->main_first || offs > c->leb_size || + offs & (c->min_io_size - 1)) + return -EINVAL; + + /* Make sure we have not already looked at this bud */ + bud = ubifs_search_bud(c, lnum); + if (bud) { + if (bud->jhead == jhead && bud->start <= offs) + return 1; + ubifs_err("bud at LEB %d:%d was already referred", lnum, offs); + return -EINVAL; + } + + return 0; +} + +/** + * replay_log_leb - replay a log logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: log logical eraseblock to replay + * @offs: offset to start replaying from + * @sbuf: scan buffer + * + * This function replays a log LEB and returns zero in case of success, %1 if + * this is the last LEB in the log, and a negative error code in case of + * failure. + */ +static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) +{ + int err; + struct ubifs_scan_leb *sleb; + struct ubifs_scan_node *snod; + const struct ubifs_cs_node *node; + + dbg_mnt("replay log LEB %d:%d", lnum, offs); + sleb = ubifs_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) { + if (c->need_recovery) + sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return PTR_ERR(sleb); + } + + if (sleb->nodes_cnt == 0) { + err = 1; + goto out; + } + + node = sleb->buf; + + snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list); + if (c->cs_sqnum == 0) { + /* + * This is the first log LEB we are looking at, make sure that + * the first node is a commit start node. Also record its + * sequence number so that UBIFS can determine where the log + * ends, because all nodes which were have higher sequence + * numbers. 
+ */ + if (snod->type != UBIFS_CS_NODE) { + dbg_err("first log node at LEB %d:%d is not CS node", + lnum, offs); + goto out_dump; + } + if (le64_to_cpu(node->cmt_no) != c->cmt_no) { + dbg_err("first CS node at LEB %d:%d has wrong " + "commit number %llu expected %llu", + lnum, offs, + (unsigned long long)le64_to_cpu(node->cmt_no), + c->cmt_no); + goto out_dump; + } + + c->cs_sqnum = le64_to_cpu(node->ch.sqnum); + dbg_mnt("commit start sqnum %llu", c->cs_sqnum); + } + + if (snod->sqnum < c->cs_sqnum) { + /* + * This means that we reached end of log and now + * look to the older log data, which was already + * committed but the eraseblock was not erased (UBIFS + * only un-maps it). So this basically means we have to + * exit with "end of log" code. + */ + err = 1; + goto out; + } + + /* Make sure the first node sits at offset zero of the LEB */ + if (snod->offs != 0) { + dbg_err("first node is not at zero offset"); + goto out_dump; + } + + list_for_each_entry(snod, &sleb->nodes, list) { + + cond_resched(); + + if (snod->sqnum >= SQNUM_WATERMARK) { + ubifs_err("file system's life ended"); + goto out_dump; + } + + if (snod->sqnum < c->cs_sqnum) { + dbg_err("bad sqnum %llu, commit sqnum %llu", + snod->sqnum, c->cs_sqnum); + goto out_dump; + } + + if (snod->sqnum > c->max_sqnum) + c->max_sqnum = snod->sqnum; + + switch (snod->type) { + case UBIFS_REF_NODE: { + const struct ubifs_ref_node *ref = snod->node; + + err = validate_ref(c, ref); + if (err == 1) + break; /* Already have this bud */ + if (err) + goto out_dump; + + err = add_replay_bud(c, le32_to_cpu(ref->lnum), + le32_to_cpu(ref->offs), + le32_to_cpu(ref->jhead), + snod->sqnum); + if (err) + goto out; + + break; + } + case UBIFS_CS_NODE: + /* Make sure it sits at the beginning of LEB */ + if (snod->offs != 0) { + ubifs_err("unexpected node in log"); + goto out_dump; + } + break; + default: + ubifs_err("unexpected node in log"); + goto out_dump; + } + } + + if (sleb->endpt || c->lhead_offs >= c->leb_size) { + 
c->lhead_lnum = lnum; + c->lhead_offs = sleb->endpt; + } + + err = !sleb->endpt; +out: + ubifs_scan_destroy(sleb); + return err; + +out_dump: + ubifs_err("log error detected while replying the log at LEB %d:%d", + lnum, offs + snod->offs); + dbg_dump_node(c, snod->node); + ubifs_scan_destroy(sleb); + return -EINVAL; +} + +/** + * take_ihead - update the status of the index head in lprops to 'taken'. + * @c: UBIFS file-system description object + * + * This function returns the amount of free space in the index head LEB or a + * negative error code. + */ +static int take_ihead(struct ubifs_info *c) +{ + const struct ubifs_lprops *lp; + int err, free; + + ubifs_get_lprops(c); + + lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + free = lp->free; + + lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC, + lp->flags | LPROPS_TAKEN, 0); + if (IS_ERR(lp)) { + err = PTR_ERR(lp); + goto out; + } + + err = free; +out: + ubifs_release_lprops(c); + return err; +} + +/** + * ubifs_replay_journal - replay journal. + * @c: UBIFS file-system description object + * + * This function scans the journal, replays and cleans it up. It makes sure all + * memory data structures related to uncommitted journal are built (dirty TNC + * tree, tree of buds, modified lprops, etc). 
+ */
+int ubifs_replay_journal(struct ubifs_info *c)
+{
+	int err, i, lnum, offs, free;
+	void *sbuf = NULL;
+
+	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
+
+	/* Update the status of the index head in lprops to 'taken' */
+	free = take_ihead(c);
+	if (free < 0)
+		return free; /* Error code */
+
+	if (c->ihead_offs != c->leb_size - free) {
+		ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
+			  c->ihead_offs);
+		return -EINVAL;
+	}
+
+	sbuf = vmalloc(c->leb_size);
+	if (!sbuf)
+		return -ENOMEM;
+
+	dbg_mnt("start replaying the journal");
+
+	c->replaying = 1;
+
+	lnum = c->ltail_lnum = c->lhead_lnum;
+	offs = c->lhead_offs;
+
+	for (i = 0; i < c->log_lebs; i++, lnum++) {
+		if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
+			/*
+			 * The log is logically circular: we ran past its last
+			 * LEB, so wrap around to the first one.
+			 */
+			lnum = UBIFS_LOG_LNUM;
+			offs = 0;
+		}
+		err = replay_log_leb(c, lnum, offs, sbuf);
+		if (err == 1)
+			/* We hit the end of the log */
+			break;
+		if (err)
+			goto out;
+		offs = 0;
+	}
+
+	err = replay_buds(c);
+	if (err)
+		goto out;
+
+	err = apply_replay_tree(c);
+	if (err)
+		goto out;
+
+	/*
+	 * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
+	 * to roughly estimate index growth. Things like @c->min_idx_lebs
+	 * depend on it. This means we have to initialize it to make sure
+	 * budgeting works properly.
+	 */
+	c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+	c->budg_uncommitted_idx *= c->max_idx_node_sz;
+
+	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
+	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
+		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
+		(unsigned long)c->highest_inum);
+out:
+	destroy_replay_tree(c);
+	destroy_bud_list(c);
+	vfree(sbuf);
+	c->replaying = 0;
+	return err;
+}
diff -Nurd linux-2.6.24/fs/ubifs/sb.c ubifs-v2.6.24/fs/ubifs/sb.c
--- linux-2.6.24/fs/ubifs/sb.c	1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/sb.c	2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,656 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Artem Bityutskiy (Битюцкий Артём)
+ *          Adrian Hunter
+ */
+
+/*
+ * This file implements the UBIFS superblock. The superblock is stored at the
+ * first LEB of the volume and is never changed by UBIFS. Only user-space tools
+ * may change it. The superblock node mostly contains geometry information.
+ */
+
+#include "ubifs.h"
+#include
+
+/*
+ * Default journal size in logical eraseblocks as a percent of total
+ * flash size.
+ */ +#define DEFAULT_JNL_PERCENT 5 + +/* Default maximum journal size in bytes */ +#define DEFAULT_MAX_JNL (32*1024*1024) + +/* Default indexing tree fanout */ +#define DEFAULT_FANOUT 8 + +/* Default number of data journal heads */ +#define DEFAULT_JHEADS_CNT 1 + +/* Default positions of different LEBs in the main area */ +#define DEFAULT_IDX_LEB 0 +#define DEFAULT_DATA_LEB 1 +#define DEFAULT_GC_LEB 2 + +/* Default number of LEB numbers in LPT's save table */ +#define DEFAULT_LSAVE_CNT 256 + +/* Default reserved pool size as a percent of maximum free space */ +#define DEFAULT_RP_PERCENT 5 + +/* The default maximum size of reserved pool in bytes */ +#define DEFAULT_MAX_RP_SIZE (5*1024*1024) + +/* Default UBIFS compressor */ +#define DEFAULT_COMPRESSOR UBIFS_COMPR_LZO + +/* Default time granularity in nanoseconds */ +#define DEFAULT_TIME_GRAN 1000000000 + +/** + * create_default_filesystem - format empty UBI volume. + * @c: UBIFS file-system description object + * + * This function creates default empty file-system. Returns zero in case of + * success and a negative error code in case of failure. + */ +static int create_default_filesystem(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + struct ubifs_mst_node *mst; + struct ubifs_idx_node *idx; + struct ubifs_branch *br; + struct ubifs_ino_node *ino; + struct ubifs_cs_node *cs; + union ubifs_key key; + int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first; + int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0; + int min_leb_cnt = UBIFS_MIN_LEB_CNT; + long long tmp64, main_bytes; + __le64 tmp_le64; + + /* Some functions called from here depend on the @c->key_len filed */ + c->key_len = UBIFS_SK_LEN; + + /* + * First of all, we have to calculate default file-system geometry - + * log size, journal size, etc. 
+ */ + if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT) + /* We can first multiply then divide and have no overflow */ + jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100; + else + jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT; + + if (jnl_lebs < UBIFS_MIN_JNL_LEBS) + jnl_lebs = UBIFS_MIN_JNL_LEBS; + if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL) + jnl_lebs = DEFAULT_MAX_JNL / c->leb_size; + + /* + * The log should be large enough to fit reference nodes for all bud + * LEBs. Because buds do not have to start from the beginning of LEBs + * (half of the LEB may contain committed data), the log should + * generally be larger, make it twice as large. + */ + tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1; + log_lebs = tmp / c->leb_size; + /* Plus one LEB reserved for commit */ + log_lebs += 1; + if (c->leb_cnt - min_leb_cnt > 8) { + /* And some extra space to allow writes while committing */ + log_lebs += 1; + min_leb_cnt += 1; + } + + max_buds = jnl_lebs - log_lebs; + if (max_buds < UBIFS_MIN_BUD_LEBS) + max_buds = UBIFS_MIN_BUD_LEBS; + + /* + * Orphan nodes are stored in a separate area. One node can store a lot + * of orphan inode numbers, but when new orphan comes we just add a new + * orphan node. At some point the nodes are consolidated into one + * orphan node. + */ + orph_lebs = UBIFS_MIN_ORPH_LEBS; +#ifdef CONFIG_UBIFS_FS_DEBUG + if (c->leb_cnt - min_leb_cnt > 1) + /* + * For debugging purposes it is better to have at least 2 + * orphan LEBs, because the orphan subsystem would need to do + * consolidations and would be stressed more. 
+ */ + orph_lebs += 1; +#endif + + main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs; + main_lebs -= orph_lebs; + + lpt_first = UBIFS_LOG_LNUM + log_lebs; + c->lsave_cnt = DEFAULT_LSAVE_CNT; + c->max_leb_cnt = c->leb_cnt; + err = ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs, + &big_lpt); + if (err) + return err; + + dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first, + lpt_first + lpt_lebs - 1); + + main_first = c->leb_cnt - main_lebs; + + /* Create default superblock */ + tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + sup = kzalloc(tmp, GFP_KERNEL); + if (!sup) + return -ENOMEM; + + tmp64 = (long long)max_buds * c->leb_size; + if (big_lpt) + sup_flags |= UBIFS_FLG_BIGLPT; + + sup->ch.node_type = UBIFS_SB_NODE; + sup->key_hash = UBIFS_KEY_HASH_R5; + sup->flags = cpu_to_le32(sup_flags); + sup->min_io_size = cpu_to_le32(c->min_io_size); + sup->leb_size = cpu_to_le32(c->leb_size); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + sup->max_leb_cnt = cpu_to_le32(c->max_leb_cnt); + sup->max_bud_bytes = cpu_to_le64(tmp64); + sup->log_lebs = cpu_to_le32(log_lebs); + sup->lpt_lebs = cpu_to_le32(lpt_lebs); + sup->orph_lebs = cpu_to_le32(orph_lebs); + sup->jhead_cnt = cpu_to_le32(DEFAULT_JHEADS_CNT); + sup->fanout = cpu_to_le32(DEFAULT_FANOUT); + sup->lsave_cnt = cpu_to_le32(c->lsave_cnt); + sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION); + sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN); + if (c->mount_opts.override_compr) + sup->default_compr = cpu_to_le16(c->mount_opts.compr_type); + else + sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO); + + generate_random_uuid(sup->uuid); + + main_bytes = (long long)main_lebs * c->leb_size; + tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100); + if (tmp64 > DEFAULT_MAX_RP_SIZE) + tmp64 = DEFAULT_MAX_RP_SIZE; + sup->rp_size = cpu_to_le64(tmp64); + sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION); + + err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); + 
kfree(sup); + if (err) + return err; + + dbg_gen("default superblock created at LEB 0:0"); + + /* Create default master node */ + mst = kzalloc(c->mst_node_alsz, GFP_KERNEL); + if (!mst) + return -ENOMEM; + + mst->ch.node_type = UBIFS_MST_NODE; + mst->log_lnum = cpu_to_le32(UBIFS_LOG_LNUM); + mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO); + mst->cmt_no = 0; + mst->root_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->root_offs = 0; + tmp = ubifs_idx_node_sz(c, 1); + mst->root_len = cpu_to_le32(tmp); + mst->gc_lnum = cpu_to_le32(main_first + DEFAULT_GC_LEB); + mst->ihead_lnum = cpu_to_le32(main_first + DEFAULT_IDX_LEB); + mst->ihead_offs = cpu_to_le32(ALIGN(tmp, c->min_io_size)); + mst->index_size = cpu_to_le64(ALIGN(tmp, 8)); + mst->lpt_lnum = cpu_to_le32(c->lpt_lnum); + mst->lpt_offs = cpu_to_le32(c->lpt_offs); + mst->nhead_lnum = cpu_to_le32(c->nhead_lnum); + mst->nhead_offs = cpu_to_le32(c->nhead_offs); + mst->ltab_lnum = cpu_to_le32(c->ltab_lnum); + mst->ltab_offs = cpu_to_le32(c->ltab_offs); + mst->lsave_lnum = cpu_to_le32(c->lsave_lnum); + mst->lsave_offs = cpu_to_le32(c->lsave_offs); + mst->lscan_lnum = cpu_to_le32(main_first); + mst->empty_lebs = cpu_to_le32(main_lebs - 2); + mst->idx_lebs = cpu_to_le32(1); + mst->leb_cnt = cpu_to_le32(c->leb_cnt); + + /* Calculate lprops statistics */ + tmp64 = main_bytes; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + mst->total_free = cpu_to_le64(tmp64); + + tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size); + ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) - + UBIFS_INO_NODE_SZ; + tmp64 += ino_waste; + tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8); + mst->total_dirty = cpu_to_le64(tmp64); + + /* The indexing LEB does not contribute to dark space */ + tmp64 = (c->main_lebs - 1) * c->dark_wm; + mst->total_dark = cpu_to_le64(tmp64); + + mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ); + + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, 
UBIFS_MST_LNUM, 0, + UBI_UNKNOWN); + if (err) { + kfree(mst); + return err; + } + err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0, + UBI_UNKNOWN); + kfree(mst); + if (err) + return err; + + dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM); + + /* Create the root indexing node */ + tmp = ubifs_idx_node_sz(c, 1); + idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL); + if (!idx) + return -ENOMEM; + + c->key_fmt = UBIFS_SIMPLE_KEY_FMT; + c->key_hash = key_r5_hash; + + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(1); + ino_key_init(c, &key, UBIFS_ROOT_INO); + br = ubifs_idx_branch(c, idx, 0); + key_write_idx(c, &key, &br->key); + br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB); + br->len = cpu_to_le32(UBIFS_INO_NODE_SZ); + err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0, + UBI_UNKNOWN); + kfree(idx); + if (err) + return err; + + dbg_gen("default root indexing node created LEB %d:0", + main_first + DEFAULT_IDX_LEB); + + /* Create default root inode */ + tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size); + ino = kzalloc(tmp, GFP_KERNEL); + if (!ino) + return -ENOMEM; + + ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO); + ino->ch.node_type = UBIFS_INO_NODE; + ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); + ino->nlink = cpu_to_le32(2); + tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + ino->atime_sec = tmp_le64; + ino->ctime_sec = tmp_le64; + ino->mtime_sec = tmp_le64; + ino->atime_nsec = 0; + ino->ctime_nsec = 0; + ino->mtime_nsec = 0; + ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); + ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); + + /* Set compression enabled by default */ + ino->flags = cpu_to_le32(UBIFS_COMPR_FL); + + err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ, + main_first + DEFAULT_DATA_LEB, 0, + UBI_UNKNOWN); + kfree(ino); + if (err) + return err; + + dbg_gen("root inode created at LEB %d:0", + main_first + DEFAULT_DATA_LEB); + + /* + * The first node 
in the log has to be the commit start node. This is + * always the case during normal file-system operation. Write a fake + * commit start node to the log. + */ + tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size); + cs = kzalloc(tmp, GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->ch.node_type = UBIFS_CS_NODE; + err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, + 0, UBI_UNKNOWN); + kfree(cs); + + ubifs_msg("default file-system created"); + return 0; +} + +/** + * validate_sb - validate superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node + * + * This function validates superblock node @sup. Since most of data was read + * from the superblock and stored in @c, the function validates fields in @c + * instead. Returns zero in case of success and %-EINVAL in case of validation + * failure. + */ +static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + long long max_bytes; + int err = 1, min_leb_cnt; + + if (!c->key_hash) { + err = 2; + goto failed; + } + + if (sup->key_fmt != UBIFS_SIMPLE_KEY_FMT) { + err = 3; + goto failed; + } + + if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { + ubifs_err("min. I/O unit mismatch: %d in superblock, %d real", + le32_to_cpu(sup->min_io_size), c->min_io_size); + goto failed; + } + + if (le32_to_cpu(sup->leb_size) != c->leb_size) { + ubifs_err("LEB size mismatch: %d in superblock, %d real", + le32_to_cpu(sup->leb_size), c->leb_size); + goto failed; + } + + if (c->log_lebs < UBIFS_MIN_LOG_LEBS || + c->lpt_lebs < UBIFS_MIN_LPT_LEBS || + c->orph_lebs < UBIFS_MIN_ORPH_LEBS || + c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 4; + goto failed; + } + + /* + * Calculate minimum allowed amount of main area LEBs. This is very + * similar to %UBIFS_MIN_LEB_CNT, but we take into account real what we + * have just read from the superblock. 
+ */ + min_leb_cnt = UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs; + min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; + + if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { + ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, " + "%d minimum required", c->leb_cnt, c->vi.size, + min_leb_cnt); + goto failed; + } + + if (c->max_leb_cnt < c->leb_cnt) { + ubifs_err("max. LEB count %d less than LEB count %d", + c->max_leb_cnt, c->leb_cnt); + goto failed; + } + + if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { + err = 7; + goto failed; + } + + if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS || + c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) { + err = 8; + goto failed; + } + + if (c->jhead_cnt < NONDATA_JHEADS_CNT + 1 || + c->jhead_cnt > NONDATA_JHEADS_CNT + UBIFS_MAX_JHEADS) { + err = 9; + goto failed; + } + + if (c->fanout < UBIFS_MIN_FANOUT || + ubifs_idx_node_sz(c, c->fanout) > c->leb_size) { + err = 10; + goto failed; + } + + if (c->lsave_cnt < 0 || (c->lsave_cnt > DEFAULT_LSAVE_CNT && + c->lsave_cnt > c->max_leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - + c->log_lebs - c->lpt_lebs - c->orph_lebs)) { + err = 11; + goto failed; + } + + if (UBIFS_SB_LEBS + UBIFS_MST_LEBS + c->log_lebs + c->lpt_lebs + + c->orph_lebs + c->main_lebs != c->leb_cnt) { + err = 12; + goto failed; + } + + if (c->default_compr < 0 || c->default_compr >= UBIFS_COMPR_TYPES_CNT) { + err = 13; + goto failed; + } + + max_bytes = c->main_lebs * (long long)c->leb_size; + if (c->rp_size < 0 || max_bytes < c->rp_size) { + err = 14; + goto failed; + } + + if (le32_to_cpu(sup->time_gran) > 1000000000 || + le32_to_cpu(sup->time_gran) < 1) { + err = 15; + goto failed; + } + + return 0; + +failed: + ubifs_err("bad superblock, error %d", err); + dbg_dump_node(c, sup); + return -EINVAL; +} + +/** + * ubifs_read_sb_node - read superblock node. 
+ * @c: UBIFS file-system description object + * + * This function returns a pointer to the superblock node or an error pointer + * (see ERR_PTR()). The node is kmalloc()'ed and the caller has to free it. + */ +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c) +{ + struct ubifs_sb_node *sup; + int err; + + sup = kmalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_NOFS); + if (!sup) + return ERR_PTR(-ENOMEM); + + err = ubifs_read_node(c, sup, UBIFS_SB_NODE, UBIFS_SB_NODE_SZ, + UBIFS_SB_LNUM, 0); + if (err) { + kfree(sup); + return ERR_PTR(err); + } + + return sup; +} + +/** + * ubifs_write_sb_node - write superblock node. + * @c: UBIFS file-system description object + * @sup: superblock node read with 'ubifs_read_sb_node()' + * + * This function returns %0 on success and a negative error code on failure. + */ +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup) +{ + int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size); + + ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1); + return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM); +} + +/** + * ubifs_read_superblock - read superblock. + * @c: UBIFS file-system description object + * + * This function finds, reads and checks the superblock. If an empty UBI volume + * is being mounted, the default file-system is created first. Returns zero in + * case of success, and a negative error code in case of failure. + */ +int ubifs_read_superblock(struct ubifs_info *c) +{ + int err, sup_flags; + struct ubifs_sb_node *sup; + + if (c->empty) { + err = create_default_filesystem(c); + if (err) + return err; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + c->fmt_version = le32_to_cpu(sup->fmt_version); + c->ro_compat_version = le32_to_cpu(sup->ro_compat_version); + + /* + * The software supports all previous versions but not future versions, + * due to the unavailability of time-travelling equipment.
+ */ + if (c->fmt_version > UBIFS_FORMAT_VERSION) { + struct super_block *sb = c->vfs_sb; + int mounting_ro = sb->s_flags & MS_RDONLY; + + ubifs_assert(!c->ro_media || mounting_ro); + if (!mounting_ro || + c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { + ubifs_err("on-flash format version is w%d/r%d, but " + "software only supports up to version " + "w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { + ubifs_msg("only R/O mounting is possible"); + err = -EROFS; + } else + err = -EINVAL; + goto out; + } + + /* + * The FS is mounted R/O, and the media format is + * R/O-compatible with the UBIFS implementation, so we can + * mount. + */ + c->rw_incompat = 1; + } + + if (c->fmt_version < 3) { /* formats older than version 3 are rejected */ + ubifs_err("on-flash format version %d is not supported", + c->fmt_version); + err = -EINVAL; + goto out; + } + + switch (sup->key_hash) { /* no default: an unknown hash leaves @c->key_hash as is, see validate_sb() */ + case UBIFS_KEY_HASH_R5: + c->key_hash = key_r5_hash; + c->key_hash_type = UBIFS_KEY_HASH_R5; + break; + + case UBIFS_KEY_HASH_TEST: + c->key_hash = key_test_hash; + c->key_hash_type = UBIFS_KEY_HASH_TEST; + break; + }; + + c->key_fmt = sup->key_fmt; + + switch (c->key_fmt) { + case UBIFS_SIMPLE_KEY_FMT: + c->key_len = UBIFS_SK_LEN; + break; + default: + ubifs_err("unsupported key format"); + err = -EINVAL; + goto out; + } + + c->leb_cnt = le32_to_cpu(sup->leb_cnt); + c->max_leb_cnt = le32_to_cpu(sup->max_leb_cnt); + c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes); + c->log_lebs = le32_to_cpu(sup->log_lebs); + c->lpt_lebs = le32_to_cpu(sup->lpt_lebs); + c->orph_lebs = le32_to_cpu(sup->orph_lebs); + c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT; + c->fanout = le32_to_cpu(sup->fanout); + c->lsave_cnt = le32_to_cpu(sup->lsave_cnt); + c->rp_size = le64_to_cpu(sup->rp_size); + c->rp_uid = le32_to_cpu(sup->rp_uid); + c->rp_gid = le32_to_cpu(sup->rp_gid); + sup_flags = le32_to_cpu(sup->flags); + if
(!c->mount_opts.override_compr) + c->default_compr = le16_to_cpu(sup->default_compr); + + c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran); + memcpy(&c->uuid, &sup->uuid, 16); + c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); + + /* Automatically grow the file system up to min(max_leb_cnt, volume size) */ + c->old_leb_cnt = c->leb_cnt; + if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) { + c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size); + if (c->vfs_sb->s_flags & MS_RDONLY) + dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + else { + dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs", + c->old_leb_cnt, c->leb_cnt); + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + c->old_leb_cnt = c->leb_cnt; /* the superblock on flash now holds the new size */ + } + } + + c->log_bytes = (long long)c->log_lebs * c->leb_size; + c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1; + c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs; + c->lpt_last = c->lpt_first + c->lpt_lebs - 1; + c->orph_first = c->lpt_last + 1; + c->orph_last = c->orph_first + c->orph_lebs - 1; + c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; + c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; + c->main_first = c->leb_cnt - c->main_lebs; + + err = validate_sb(c, sup); +out: + kfree(sup); + return err; +} diff -Nurd linux-2.6.24/fs/ubifs/scan.c ubifs-v2.6.24/fs/ubifs/scan.c --- linux-2.6.24/fs/ubifs/scan.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/scan.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,362 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements the scan which is a general-purpose function for + * determining what nodes are in an eraseblock. The scan is used to replay the + * journal, to do garbage collection, for the TNC in-the-gaps method, and by + * debugging functions. + */ + +#include "ubifs.h" + +/** + * scan_padding_bytes - scan for padding bytes. + * @buf: buffer to scan + * @len: length of buffer + * + * This function returns the number of padding bytes on success and + * %SCANNED_GARBAGE on failure. + */ +static int scan_padding_bytes(void *buf, int len) +{ + int pad_len = 0, max_pad_len = min_t(int, UBIFS_PAD_NODE_SZ, len); + uint8_t *p = buf; + + dbg_scan("not a node"); + + while (pad_len < max_pad_len && *p++ == UBIFS_PADDING_BYTE) + pad_len += 1; + + if (!pad_len || (pad_len & 7)) /* no padding at all, or not a multiple of 8 bytes */ + return SCANNED_GARBAGE; + + dbg_scan("%d padding bytes", pad_len); + + return pad_len; +} + +/** + * ubifs_scan_a_node - scan for a node or padding. + * @c: UBIFS file-system description object + * @buf: buffer to scan + * @len: length of buffer + * @lnum: logical eraseblock number + * @offs: offset within the logical eraseblock + * @quiet: print no messages + * + * This function returns a scanning code to indicate what was scanned.
+ */ +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet) +{ + struct ubifs_ch *ch = buf; + uint32_t magic; + + magic = le32_to_cpu(ch->magic); + + if (magic == 0xFFFFFFFF) { + dbg_scan("hit empty space"); + return SCANNED_EMPTY_SPACE; + } + + if (magic != UBIFS_NODE_MAGIC) + return scan_padding_bytes(buf, len); + + if (len < UBIFS_CH_SZ) + return SCANNED_GARBAGE; + + dbg_scan("scanning %s", dbg_ntype(ch->node_type)); + + if (ubifs_check_node(c, buf, lnum, offs, quiet, 1)) + return SCANNED_A_CORRUPT_NODE; + + if (ch->node_type == UBIFS_PAD_NODE) { + struct ubifs_pad_node *pad = buf; + int pad_len = le32_to_cpu(pad->pad_len); + int node_len = le32_to_cpu(ch->len); + + /* Validate the padding node */ + if (pad_len < 0 || + offs + node_len + pad_len > c->leb_size) { + if (!quiet) { + ubifs_err("bad pad node at LEB %d:%d", + lnum, offs); + dbg_dump_node(c, pad); + } + return SCANNED_A_BAD_PAD_NODE; + } + + /* Make sure the node pads to 8-byte boundary */ + if ((node_len + pad_len) & 7) { + if (!quiet) { + dbg_err("bad padding length %d - %d", + offs, offs + node_len + pad_len); + } + return SCANNED_A_BAD_PAD_NODE; + } + + dbg_scan("%d bytes padded, offset now %d", + pad_len, ALIGN(offs + node_len + pad_len, 8)); + + return node_len + pad_len; /* positive: bytes to skip (pad node plus padding) */ + } + + return SCANNED_A_NODE; +} + +/** + * ubifs_start_scan - create LEB scanning information at start of scan. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function returns the LEB scanning information or an error pointer on failure.
+ */ +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + struct ubifs_scan_leb *sleb; + int err; + + dbg_scan("scan LEB %d:%d", lnum, offs); + + sleb = kzalloc(sizeof(struct ubifs_scan_leb), GFP_NOFS); + if (!sleb) + return ERR_PTR(-ENOMEM); + + sleb->lnum = lnum; + INIT_LIST_HEAD(&sleb->nodes); + sleb->buf = sbuf; + + err = ubi_read(c->ubi, lnum, sbuf + offs, offs, c->leb_size - offs); + if (err && err != -EBADMSG) { + ubifs_err("cannot read %d bytes from LEB %d:%d," + " error %d", c->leb_size - offs, lnum, offs, err); + kfree(sleb); + return ERR_PTR(err); + } + + if (err == -EBADMSG) + sleb->ecc = 1; /* -EBADMSG is not fatal here, it is only recorded */ + + return sleb; +} + +/** + * ubifs_end_scan - update LEB scanning information at end of scan. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @lnum: logical eraseblock number + * @offs: offset at which the scan stopped + * + * This function stores @offs, aligned to the min. I/O unit size, in @sleb->endpt. + */ +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs) +{ + lnum = lnum; /* NOTE(review): no-op; presumably silences an unused-parameter warning when dbg_scan() compiles out - confirm */ + dbg_scan("stop scanning LEB %d at offset %d", lnum, offs); + ubifs_assert(offs % c->min_io_size == 0); + + sleb->endpt = ALIGN(offs, c->min_io_size); +} + +/** + * ubifs_add_snod - add a scanned node to LEB scanning information. + * @c: UBIFS file-system description object + * @sleb: scanning information + * @buf: buffer containing node + * @offs: offset of node on flash + * + * This function returns %0 on success and a negative error code on failure.
+ */ +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs) +{ + struct ubifs_ch *ch = buf; + struct ubifs_ino_node *ino = buf; + struct ubifs_scan_node *snod; + + snod = kzalloc(sizeof(struct ubifs_scan_node), GFP_NOFS); + if (!snod) + return -ENOMEM; + + snod->sqnum = le64_to_cpu(ch->sqnum); + snod->type = ch->node_type; + snod->offs = offs; + snod->len = le32_to_cpu(ch->len); + snod->node = buf; /* not copied: points into the caller's buffer */ + + switch (ch->node_type) { + case UBIFS_INO_NODE: + case UBIFS_DENT_NODE: + case UBIFS_XENT_NODE: + case UBIFS_DATA_NODE: + case UBIFS_TRUN_NODE: + /* + * The key is in the same place in all keyed + * nodes, so read it via the inode node layout. + */ + key_read(c, &ino->key, &snod->key); + break; + } + list_add_tail(&snod->list, &sleb->nodes); + sleb->nodes_cnt += 1; + return 0; +} + +/** + * ubifs_scanned_corruption - print information after UBIFS scanned corruption. + * @c: UBIFS file-system description object + * @lnum: LEB number of corruption + * @offs: offset of corruption + * @buf: buffer containing corruption + */ +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf) +{ + int len; + + ubifs_err("corrupted data at LEB %d:%d", lnum, offs); + if (dbg_failure_mode) + return; + len = c->leb_size - offs; + if (len > 4096) + len = 4096; /* dump at most 4096 bytes */ + dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); +} + +/** + * ubifs_scan - scan a logical eraseblock. + * @c: UBIFS file-system description object + * @lnum: logical eraseblock number + * @offs: offset to start at (usually zero) + * @sbuf: scan buffer (must be c->leb_size) + * + * This function scans LEB number @lnum and returns complete information about + * its contents. Returns an error pointer (see ERR_PTR()) in case of failure.
+ */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf) +{ + void *buf = sbuf + offs; + int err, len = c->leb_size - offs; + struct ubifs_scan_leb *sleb; + + sleb = ubifs_start_scan(c, lnum, offs, sbuf); + if (IS_ERR(sleb)) + return sleb; + + while (len >= 8) { + struct ubifs_ch *ch = buf; + int node_len, ret; + + dbg_scan("look at LEB %d:%d (%d bytes left)", + lnum, offs, len); + + cond_resched(); + + ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 0); + + if (ret > 0) { + /* Padding bytes or a valid padding node */ + offs += ret; + buf += ret; + len -= ret; + continue; + } + + if (ret == SCANNED_EMPTY_SPACE) + /* Empty space is checked later */ + break; + + switch (ret) { + case SCANNED_GARBAGE: + dbg_err("garbage"); + goto corrupted; + case SCANNED_A_NODE: + break; + case SCANNED_A_CORRUPT_NODE: + case SCANNED_A_BAD_PAD_NODE: + dbg_err("bad node"); + goto corrupted; + default: + dbg_err("unknown"); + goto corrupted; + } + + err = ubifs_add_snod(c, sleb, buf, offs); + if (err) + goto error; + + node_len = ALIGN(le32_to_cpu(ch->len), 8); /* advance by the 8-byte aligned node length */ + offs += node_len; + buf += node_len; + len -= node_len; + } + + if (offs % c->min_io_size) + goto corrupted; + + ubifs_end_scan(c, sleb, lnum, offs); + + for (; len > 4; offs += 4, buf = buf + 4, len -= 4) /* check the remaining space one 32-bit word at a time */ + if (*(uint32_t *)buf != 0xffffffff) + break; + for (; len; offs++, buf++, len--) /* then byte by byte, to report the exact offset of a non-0xFF byte */ + if (*(uint8_t *)buf != 0xff) { + ubifs_err("corrupt empty space at LEB %d:%d", + lnum, offs); + goto corrupted; + } + + return sleb; + +corrupted: + ubifs_scanned_corruption(c, lnum, offs, buf); + err = -EUCLEAN; +error: + ubifs_err("LEB %d scanning failed", lnum); + ubifs_scan_destroy(sleb); + return ERR_PTR(err); +} + +/** + * ubifs_scan_destroy - destroy LEB scanning information.
+ * @sleb: scanning information to free (all scan nodes on its list and @sleb itself) + */ +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb) +{ + struct ubifs_scan_node *node; + struct list_head *head; + + head = &sleb->nodes; + while (!list_empty(head)) { + node = list_entry(head->next, struct ubifs_scan_node, list); + list_del(&node->list); + kfree(node); /* frees the scan node only, not the buffer node->node points to */ + } + kfree(sleb); +} diff -Nurd linux-2.6.24/fs/ubifs/shrinker.c ubifs-v2.6.24/fs/ubifs/shrinker.c --- linux-2.6.24/fs/ubifs/shrinker.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/shrinker.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,320 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS shrinker which evicts clean znodes from the TNC + * tree when Linux VM needs more RAM. + * + * We do not implement any LRU lists to find oldest znodes to free because it + * would add additional overhead to the file system fast paths. So the shrinker + * just walks the TNC tree when searching for znodes to free. + * + * If the root of a TNC sub-tree is clean and old enough, then the children are + * also clean and old enough. So the shrinker walks the TNC in level order and + * dumps entire sub-trees.
+ * + * The age of znodes is derived from the time-stamp of when they were last looked at. + * The current shrinker first tries to evict old znodes, then young ones. + * + * Since the shrinker is global, it has to protect against races with FS + * un-mounts, which is done by the 'ubifs_infos_lock' and 'c->umount_mutex'. + */ + +#include "ubifs.h" + +/* List of all UBIFS file-system instances */ +LIST_HEAD(ubifs_infos); + +/* + * We number each shrinker run and record the number on the ubifs_info structure + * so that we can easily work out which ubifs_info structures have already been + * done by the current run. + */ +static unsigned int shrinker_run_no; + +/* Protects 'ubifs_infos' list */ +DEFINE_SPINLOCK(ubifs_infos_lock); + +/* Global clean znode counter (for all mounted UBIFS instances) */ +atomic_long_t ubifs_clean_zn_cnt; + +/** + * shrink_tnc - shrink TNC tree. + * @c: UBIFS file-system description object + * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function traverses TNC tree and frees clean znodes. It does not free + * clean znodes which are younger than @age. Returns number of freed znodes. + */ +static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) +{ + int total_freed = 0; + struct ubifs_znode *znode, *zprev; + int time = get_seconds(); + + ubifs_assert(mutex_is_locked(&c->umount_mutex)); + ubifs_assert(mutex_is_locked(&c->tnc_mutex)); + + if (!c->zroot.znode || atomic_long_read(&c->clean_zn_cnt) == 0) + return 0; + + /* + * Traverse the TNC tree in levelorder manner, so that it is possible + * to destroy large sub-trees. Indeed, if a znode is old, then all its + * children are older or of the same age. + * + * Note, we are holding 'c->tnc_mutex', so we do not have to lock the + * 'c->space_lock' when _reading_ 'c->clean_zn_cnt', because it is + * changed only when the 'c->tnc_mutex' is held.
+ */ + zprev = NULL; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL); + while (znode && total_freed < nr && + atomic_long_read(&c->clean_zn_cnt) > 0) { + int freed; + + /* + * If the znode is clean, but it is in the 'c->cnext' list, this + * means that this znode has just been written to flash as a + * part of commit and was marked clean. They will be removed + * from the list at the end of commit. We cannot change the list, + * because it is not protected by any mutex (design decision to + * make commit really independent and parallel to main I/O). So + * we just skip these znodes. + * + * Note, the 'clean_zn_cnt' counters are not updated until + * after the commit, so the UBIFS shrinker does not report + * the znodes which are in the 'c->cnext' list as freeable. + * + * Also note, if the root of a sub-tree is not in 'c->cnext', + * then the whole sub-tree is not in 'c->cnext' as well, so it + * is safe to dump the whole sub-tree. + */ + + if (znode->cnext) { + /* + * Very soon these znodes will be removed from the list + * and become freeable. + */ + *contention = 1; + } else if (!ubifs_zn_dirty(znode) && + abs(time - znode->time) >= age) { + if (znode->parent) + znode->parent->zbranch[znode->iip].znode = NULL; + else + c->zroot.znode = NULL; + + freed = ubifs_destroy_tnc_subtree(znode); + atomic_long_sub(freed, &ubifs_clean_zn_cnt); + atomic_long_sub(freed, &c->clean_zn_cnt); + ubifs_assert(atomic_long_read(&c->clean_zn_cnt) >= 0); + total_freed += freed; + znode = zprev; /* the sub-tree is gone, resume the walk from the previous znode */ + } + + if (unlikely(!c->zroot.znode)) + break; + + zprev = znode; + znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode); + cond_resched(); + } + + return total_freed; +} + +/** + * shrink_tnc_trees - shrink UBIFS TNC trees.
+ * @nr: number of znodes to free + * @age: the age of znodes to free + * @contention: if any contention, this is set to %1 + * + * This function walks the list of mounted UBIFS file-systems and frees clean + * znodes which are older than @age, until at least @nr znodes are freed. + * File-systems whose 'umount_mutex' or 'tnc_mutex' cannot be taken at once are + * skipped and reported via @contention. Returns the number of freed znodes. + */ +static int shrink_tnc_trees(int nr, int age, int *contention) +{ + struct ubifs_info *c; + struct list_head *p; + unsigned int run_no; + int freed = 0; + + spin_lock(&ubifs_infos_lock); + do { + run_no = ++shrinker_run_no; + } while (run_no == 0); /* never use run number 0 - presumably it marks a not yet visited FS, TODO confirm */ + /* Iterate over all mounted UBIFS file-systems and try to shrink them */ + p = ubifs_infos.next; + while (p != &ubifs_infos) { + c = list_entry(p, struct ubifs_info, infos_list); + /* + * We move the ones we do to the end of the list, so we stop + * when we see one we have already done. + */ + if (c->shrinker_run_no == run_no) + break; + if (!mutex_trylock(&c->umount_mutex)) { + /* Some un-mount is in progress, try next FS */ + *contention = 1; + p = p->next; + continue; + } + /* + * We're holding 'c->umount_mutex', so the file-system won't go + * away. + */ + if (!mutex_trylock(&c->tnc_mutex)) { + mutex_unlock(&c->umount_mutex); + *contention = 1; + p = p->next; + continue; + } + spin_unlock(&ubifs_infos_lock); + /* + * OK, now we have TNC locked, the file-system cannot go away - + * it is safe to reap the cache. + */ + c->shrinker_run_no = run_no; + freed += shrink_tnc(c, nr, age, contention); + mutex_unlock(&c->tnc_mutex); + spin_lock(&ubifs_infos_lock); + /* Get the next list element before we move this one */ + p = p->next; + /* + * Move this one to the end of the list to provide some + * fairness. + */ + list_move_tail(&c->infos_list, &ubifs_infos); + mutex_unlock(&c->umount_mutex); + if (freed >= nr) + break; + } + spin_unlock(&ubifs_infos_lock); + return freed; +} + +/** + * kick_a_thread - kick a background thread to start commit.
+ * + * This function kicks a background thread to start background commit. Returns + * %-1 if a thread was kicked or there is another reason to assume the memory + * will soon be freed or become freeable. If there are no dirty znodes, returns + * %0. + */ +static int kick_a_thread(void) +{ + int i; + struct ubifs_info *c; + + /* + * Iterate over all mounted UBIFS file-systems and find out if there is + * already an ongoing commit operation there. If not, then iterate for + * the second time and initiate background commit. + */ + spin_lock(&ubifs_infos_lock); + for (i = 0; i < 2; i++) { + list_for_each_entry(c, &ubifs_infos, infos_list) { + long dirty_zn_cnt; + + if (!mutex_trylock(&c->umount_mutex)) { + /* + * Some un-mount is in progress, it will + * certainly free memory, so just return. + */ + spin_unlock(&ubifs_infos_lock); + return -1; + } + + dirty_zn_cnt = atomic_long_read(&c->dirty_zn_cnt); + + if (!dirty_zn_cnt || c->cmt_state == COMMIT_BROKEN || + c->ro_media) { + mutex_unlock(&c->umount_mutex); + continue; + } + + if (c->cmt_state != COMMIT_RESTING) { + spin_unlock(&ubifs_infos_lock); + mutex_unlock(&c->umount_mutex); + return -1; + } + + if (i == 1) { + list_move_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + ubifs_request_bg_commit(c); + mutex_unlock(&c->umount_mutex); + return -1; + } + mutex_unlock(&c->umount_mutex); + } + } + spin_unlock(&ubifs_infos_lock); + + return 0; +} +/** + * ubifs_shrinker - UBIFS TNC shrinker call-back. + * @nr: number of znodes to free, zero means just report the clean znode count + * @gfp_mask: not used by this function + * + * If @nr is zero, returns the global number of clean znodes. Otherwise tries to + * free @nr clean znodes - old ones first, then young ones, then any - and + * returns the number of freed znodes. If there are no clean znodes at all, the + * result of 'kick_a_thread()' is returned. Returns %-1 if nothing was freed + * but there was contention. + */ +int ubifs_shrinker(int nr, gfp_t gfp_mask) +{ + int freed, contention = 0; + long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt); + + if (nr == 0) + return clean_zn_cnt; + + if (!clean_zn_cnt) { + /* + * No clean znodes, nothing to reap. All we can do in this case + * is to kick background threads to start commit, which will + * probably make clean znodes which, in turn, will be freeable. + * And we return -1, which makes the VM call us again + * later.
+ */ + dbg_tnc("no clean znodes, kick a thread"); + return kick_a_thread(); + } + + freed = shrink_tnc_trees(nr, OLD_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough old znodes, try to free young ones"); + freed += shrink_tnc_trees(nr - freed, YOUNG_ZNODE_AGE, &contention); + if (freed >= nr) + goto out; + + dbg_tnc("not enough young znodes, free all"); + freed += shrink_tnc_trees(nr - freed, 0, &contention); + /* Nothing was freed, but only because of contention - return %-1 instead of %0 */ + if (!freed && contention) { + dbg_tnc("freed nothing, but contention"); + return -1; + } + +out: + dbg_tnc("%d znodes were freed, requested %d", freed, nr); + return freed; +} diff -Nurd linux-2.6.24/fs/ubifs/super.c ubifs-v2.6.24/fs/ubifs/super.c --- linux-2.6.24/fs/ubifs/super.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/super.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,2209 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS initialization and VFS superblock operations. Some + * initialization stuff which is rather large and complex is placed at + * corresponding subsystems, but most of it is here.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs.h" + +/* + * Maximum amount of memory we may 'kmalloc()' without worrying that we are + * allocating too much. + */ +#define UBIFS_KMALLOC_OK (128*1024) + +/* Slab cache for UBIFS inodes */ +struct kmem_cache *ubifs_inode_slab; + +#ifndef UBIFS_COMPAT_NO_SHRINKER +/* UBIFS TNC shrinker description */ +static struct shrinker ubifs_shrinker_info = { + .shrink = ubifs_shrinker, + .seeks = DEFAULT_SEEKS, +}; +#endif + +/** + * validate_inode - validate inode. + * @c: UBIFS file-system description object + * @inode: the inode to validate + * + * This is a helper function for 'ubifs_iget()' which validates various fields + * of a newly built inode to make sure they contain sane values and prevent + * possible vulnerabilities. Returns zero if the inode is all right and + * a non-zero error code if not. + */ +static int validate_inode(struct ubifs_info *c, const struct inode *inode) +{ + int err; + const struct ubifs_inode *ui = ubifs_inode(inode); + + if (inode->i_size > c->max_inode_sz) { + ubifs_err("inode is too large (%lld)", + (long long)inode->i_size); + return 1; + } + + if (ui->compr_type < 0 || ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { + ubifs_err("unknown compression type %d", ui->compr_type); + return 2; + } + + if (ui->xattr_names + ui->xattr_cnt > XATTR_LIST_MAX) + return 3; + + if (ui->data_len < 0 || ui->data_len > UBIFS_MAX_INO_DATA) + return 4; + + if (ui->xattr && (inode->i_mode & S_IFMT) != S_IFREG) + return 5; + + if (!ubifs_compr_present(ui->compr_type)) { + ubifs_warn("inode %lu uses '%s' compression, but it was not " + "compiled in", inode->i_ino, + ubifs_compr_name(ui->compr_type)); + } + + err = dbg_check_dir_size(c, inode); + return err; +} + +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) +{ + struct inode *inode; + + inode = iget(sb, inum); + if (!inode) { + make_bad_inode(inode); + return ERR_PTR(-EINVAL); + 
} + + return inode; +} +/** + * ubifs_read_inode - read an inode from the flash media. + * @inode: the inode to fill in, 'inode->i_ino' tells which inode node to read + * + * This function looks the inode node up in the TNC, copies its fields to the + * VFS inode and to the UBIFS inode, validates them and sets up the operations + * which correspond to the inode type. It returns nothing - in case of any + * failure the inode is marked bad with 'make_bad_inode()'. + */ +void ubifs_read_inode(struct inode *inode) +{ + int err; + union ubifs_key key; + struct ubifs_ino_node *ino; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + dbg_gen("inode %lu", inode->i_ino); + ubifs_assert(inode->i_state & I_LOCK); + + ino = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); + if (!ino) { + err = -ENOMEM; + goto out; + } + + ino_key_init(c, &key, inode->i_ino); + + err = ubifs_tnc_lookup(c, &key, ino); + if (err) + goto out_ino; + /* The inode node fields are little-endian - convert them while copying */ + inode->i_flags |= (S_NOCMTIME | S_NOATIME); + inode->i_nlink = le32_to_cpu(ino->nlink); + inode->i_uid = le32_to_cpu(ino->uid); + inode->i_gid = le32_to_cpu(ino->gid); + inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); + inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); + inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); + inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); + inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode->i_mode = le32_to_cpu(ino->mode); + inode->i_size = le64_to_cpu(ino->size); + + ui->data_len = le32_to_cpu(ino->data_len); + ui->flags = le32_to_cpu(ino->flags); + ui->compr_type = le16_to_cpu(ino->compr_type); + ui->creat_sqnum = le64_to_cpu(ino->creat_sqnum); + ui->xattr_cnt = le32_to_cpu(ino->xattr_cnt); + ui->xattr_size = le32_to_cpu(ino->xattr_size); + ui->xattr_names = le32_to_cpu(ino->xattr_names); + ui->synced_i_size = ui->ui_size = inode->i_size; + + ui->xattr = (ui->flags & UBIFS_XATTR_FL) ?
1 : 0; + + err = validate_inode(c, inode); + if (err) + goto out_invalid; + + /* Disable read-ahead */ + inode->i_mapping->backing_dev_info = &c->bdi; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &ubifs_file_address_operations; + inode->i_op = &ubifs_file_inode_operations; + inode->i_fop = &ubifs_file_operations; + if (ui->xattr) { /* keep a NUL-terminated copy of the data stored in the inode node */ + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + } else if (ui->data_len != 0) { + err = 10; /* positive codes only identify the failed check, -EINVAL is what gets reported */ + goto out_invalid; + } + break; + case S_IFDIR: + inode->i_op = &ubifs_dir_inode_operations; + inode->i_fop = &ubifs_dir_operations; + if (ui->data_len != 0) { + err = 11; + goto out_invalid; + } + break; + case S_IFLNK: + inode->i_op = &ubifs_symlink_inode_operations; + if (ui->data_len <= 0 || ui->data_len > UBIFS_MAX_INO_DATA) { + err = 12; + goto out_invalid; + } + ui->data = kmalloc(ui->data_len + 1, GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + memcpy(ui->data, ino->data, ui->data_len); + ((char *)ui->data)[ui->data_len] = '\0'; + break; + case S_IFBLK: + case S_IFCHR: + { + dev_t rdev; + union ubifs_dev_desc *dev; + + ui->data = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS); + if (!ui->data) { + err = -ENOMEM; + goto out_ino; + } + + dev = (union ubifs_dev_desc *)ino->data; /* the data length tells which encoding of the device number is used */ + if (ui->data_len == sizeof(dev->new)) + rdev = new_decode_dev(le32_to_cpu(dev->new)); + else if (ui->data_len == sizeof(dev->huge)) + rdev = huge_decode_dev(le64_to_cpu(dev->huge)); + else { + err = 13; + goto out_invalid; + } + memcpy(ui->data, ino->data, ui->data_len); + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + break; + } + case S_IFSOCK: + case S_IFIFO: + inode->i_op = &ubifs_file_inode_operations; + init_special_inode(inode, inode->i_mode, 0); + if (ui->data_len != 0) { + err = 14; + goto out_invalid; +
} + break; + default: + err = 15; + goto out_invalid; + } + + ubifs_set_inode_flags(inode); + kfree(ino); + return; + +out_invalid: + ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + dbg_dump_inode(c, inode); + dbg_dump_node(c, ino); + err = -EINVAL; +out_ino: + kfree(ino); +out: + ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + make_bad_inode(inode); + return; +} +/* + * Allocate a UBIFS inode from 'ubifs_inode_slab' and return the VFS inode which + * is embedded into it. Only the UBIFS-specific part is zeroed here. Note, the + * 'memset()' below and 'ubifs_destroy_inode()' rely on 'vfs_inode' being the + * first member of 'struct ubifs_inode'. + */ +static struct inode *ubifs_alloc_inode(struct super_block *sb) +{ + struct ubifs_inode *ui; + + ui = kmem_cache_alloc(ubifs_inode_slab, GFP_NOFS); + if (!ui) + return NULL; + + memset((void *)ui + sizeof(struct inode), 0, + sizeof(struct ubifs_inode) - sizeof(struct inode)); + mutex_init(&ui->ui_mutex); + spin_lock_init(&ui->ui_lock); + return &ui->vfs_inode; +}; +/* Free the copy of the inode data and return the inode to 'ubifs_inode_slab' */ +static void ubifs_destroy_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + kfree(ui->data); + kmem_cache_free(ubifs_inode_slab, inode); +} + +/* + * Write a dirty inode to the journal and release its dirty inode budget. The + * @wait argument is not used. Bad inodes and inodes which are not dirty any + * longer are silently skipped. + * + * Note, Linux write-back code calls this without 'i_mutex'. + */ +static int ubifs_write_inode(struct inode *inode, int wait) +{ + int err = 0; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(!ui->xattr); + if (is_bad_inode(inode)) + return 0; + + mutex_lock(&ui->ui_mutex); + /* + * Due to races between write-back forced by budgeting + * (see 'sync_some_inodes()') and pdflush write-back, the inode may + * have already been synchronized, do not do this again. This might + * also happen if it was synchronized in a VFS operation, e.g. + * 'ubifs_link()'. + */ + if (!ui->dirty) { + mutex_unlock(&ui->ui_mutex); + return 0; + } + + /* + * As an optimization, do not write orphan inodes to the media just + * because this is not needed.
+ */ + dbg_gen("inode %lu, mode %#x, nlink %u", + inode->i_ino, (int)inode->i_mode, inode->i_nlink); + if (inode->i_nlink) { + err = ubifs_jnl_write_inode(c, inode); + if (err) + ubifs_err("can't write inode %lu, error %d", + inode->i_ino, err); + } + + ui->dirty = 0; /* the inode is marked clean even if writing it failed */ + mutex_unlock(&ui->ui_mutex); + ubifs_release_dirty_inode_budget(c, ui); + return err; +} +/* + * Delete an inode which has no links and no users: drop its pages, record the + * deletion in the journal, release the dirty inode budget if the inode was + * dirty, and clear the inode. For bad inodes and extended attribute inodes + * nothing is written to the journal. + */ +static void ubifs_delete_inode(struct inode *inode) +{ + int err; + struct ubifs_info *c = inode->i_sb->s_fs_info; + struct ubifs_inode *ui = ubifs_inode(inode); + + if (ui->xattr) + /* + * Extended attribute inode deletions are fully handled in + * 'ubifs_removexattr()'. These inodes are special and have + * limited usage, so there is nothing to do here. + */ + goto out; + + dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode); + ubifs_assert(!atomic_read(&inode->i_count)); + ubifs_assert(inode->i_nlink == 0); + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) + goto out; + + ui->ui_size = inode->i_size = 0; + err = ubifs_jnl_delete_inode(c, inode); + if (err) + /* + * Worst case we have a lost orphan inode wasting space, so a + * simple error message is OK here.
+ */ + ubifs_err("can't delete inode %lu, error %d", + inode->i_ino, err); + +out: + if (ui->dirty) + ubifs_release_dirty_inode_budget(c, ui); + clear_inode(inode); +} +/* Mark the UBIFS inode dirty, the caller has to hold 'ui->ui_mutex' */ +static void ubifs_dirty_inode(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + ubifs_assert(mutex_is_locked(&ui->ui_mutex)); + if (!ui->dirty) { + ui->dirty = 1; + dbg_gen("inode %lu", inode->i_ino); + } +} +/* + * Fill @buf with file-system statistics. Free space comes from + * 'ubifs_get_free_space()', and the space available to users is the free space + * minus 'c->report_rp_size' (zero if there is less than that). The numbers of + * files and free files are always reported as zero. Always returns zero. + */ +static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ubifs_info *c = dentry->d_sb->s_fs_info; + unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; + + free = ubifs_get_free_space(c); + dbg_gen("free space %lld bytes (%lld blocks)", + free, free >> UBIFS_BLOCK_SHIFT); + + buf->f_type = UBIFS_SUPER_MAGIC; + buf->f_bsize = UBIFS_BLOCK_SIZE; + buf->f_blocks = c->block_cnt; + buf->f_bfree = free >> UBIFS_BLOCK_SHIFT; + if (free > c->report_rp_size) + buf->f_bavail = (free - c->report_rp_size) >> UBIFS_BLOCK_SHIFT; + else + buf->f_bavail = 0; + buf->f_files = 0; + buf->f_ffree = 0; + buf->f_namelen = UBIFS_MAX_NLEN; + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); /* fold the four 32-bit UUID words into two */ + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); + ubifs_assert(buf->f_bfree <= c->block_cnt); + return 0; +} +/* + * Print the mount options which were explicitly specified. In 'c->mount_opts' + * the value %2 selects the positive form of an option and %1 the negative one, + * any other value means nothing is printed for that option. + */ +static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct ubifs_info *c = mnt->mnt_sb->s_fs_info; + + if (c->mount_opts.unmount_mode == 2) + seq_printf(s, ",fast_unmount"); + else if (c->mount_opts.unmount_mode == 1) + seq_printf(s, ",norm_unmount"); + + if (c->mount_opts.bulk_read == 2) + seq_printf(s, ",bulk_read"); + else if (c->mount_opts.bulk_read == 1) + seq_printf(s, ",no_bulk_read"); + + if (c->mount_opts.chk_data_crc == 2) + seq_printf(s, ",chk_data_crc"); + else if (c->mount_opts.chk_data_crc == 1) + seq_printf(s, ",no_chk_data_crc"); + + if (c->mount_opts.override_compr) { + seq_printf(s, ",compr=%s", + ubifs_compr_name(c->mount_opts.compr_type)); + } + + return 0; +} + +static int
ubifs_sync_fs(struct super_block *sb, int wait) +{ + int i, err; + struct ubifs_info *c = sb->s_fs_info; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .range_start = 0, + .range_end = LLONG_MAX, + .nr_to_write = LONG_MAX, + }; + + /* + * Zero @wait is just an advisory thing to help the file system shove + * lots of data into the queues, and there will be the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (!wait) + return 0; + /* Nothing is written back or committed on a read-only mount */ + if (sb->s_flags & MS_RDONLY) + return 0; + + /* + * VFS calls '->sync_fs()' before synchronizing all dirty inodes and + * pages, so synchronize them first, then commit the journal. Strictly + * speaking, it is not necessary to commit the journal here, + * synchronizing write-buffers would be enough. But committing makes + * UBIFS free space predictions much more accurate, so we want to let + * the user be able to get more accurate results of 'statfs()' after + * they synchronize the file system. + */ + generic_sync_sb_inodes(sb, &wbc); + + /* + * Synchronize write buffers, because 'ubifs_run_commit()' does not + * do this if it waits for an already running commit. + */ + for (i = 0; i < c->jhead_cnt; i++) { + err = ubifs_wbuf_sync(&c->jheads[i].wbuf); + if (err) + return err; + } + + err = ubifs_run_commit(c); + if (err) + return err; + + return ubi_sync(c->vi.ubi_num); +} + +/** + * init_constants_early - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This function initializes UBIFS constants which do not need the superblock to + * be read. It also checks that the UBI volume satisfies basic UBIFS + * requirements. Returns zero in case of success and a negative error code in + * case of failure.
+ */ +static int init_constants_early(struct ubifs_info *c) +{ + if (c->vi.corrupted) { + ubifs_warn("UBI volume is corrupted - read-only mode"); + c->ro_media = 1; + } + + if (c->di.ro_mode) { + ubifs_msg("read-only UBI device"); + c->ro_media = 1; + } + + if (c->vi.vol_type == UBI_STATIC_VOLUME) { + ubifs_msg("static UBI volume - read-only mode"); + c->ro_media = 1; + } + + c->leb_cnt = c->vi.size; + c->leb_size = c->vi.usable_leb_size; + c->half_leb_size = c->leb_size / 2; + c->min_io_size = c->di.min_io_size; + c->min_io_shift = fls(c->min_io_size) - 1; + + if (c->leb_size < UBIFS_MIN_LEB_SZ) { + ubifs_err("too small LEBs (%d bytes), min. is %d bytes", + c->leb_size, UBIFS_MIN_LEB_SZ); + return -EINVAL; + } + + if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { + ubifs_err("too few LEBs (%d), min. is %d", + c->leb_cnt, UBIFS_MIN_LEB_CNT); + return -EINVAL; + } + + if (!is_power_of_2(c->min_io_size)) { + ubifs_err("bad min. I/O size %d", c->min_io_size); + return -EINVAL; + } + + /* + * UBIFS aligns all nodes to 8-byte boundary, so to make functions in + * io.c simpler, assume minimum I/O unit size to be 8 bytes if it is + * less than 8. + */ + if (c->min_io_size < 8) { + c->min_io_size = 8; + c->min_io_shift = 3; + } + + c->ref_node_alsz = ALIGN(UBIFS_REF_NODE_SZ, c->min_io_size); + c->mst_node_alsz = ALIGN(UBIFS_MST_NODE_SZ, c->min_io_size); + + /* + * Initialize node length ranges which are mostly needed for node + * length validation.
+ */ + c->ranges[UBIFS_PAD_NODE].len = UBIFS_PAD_NODE_SZ; + c->ranges[UBIFS_SB_NODE].len = UBIFS_SB_NODE_SZ; + c->ranges[UBIFS_MST_NODE].len = UBIFS_MST_NODE_SZ; + c->ranges[UBIFS_REF_NODE].len = UBIFS_REF_NODE_SZ; + c->ranges[UBIFS_TRUN_NODE].len = UBIFS_TRUN_NODE_SZ; + c->ranges[UBIFS_CS_NODE].len = UBIFS_CS_NODE_SZ; + /* The node types above have one fixed length, the ones below have a length range */ + c->ranges[UBIFS_INO_NODE].min_len = UBIFS_INO_NODE_SZ; + c->ranges[UBIFS_INO_NODE].max_len = UBIFS_MAX_INO_NODE_SZ; + c->ranges[UBIFS_ORPH_NODE].min_len = + UBIFS_ORPH_NODE_SZ + sizeof(__le64); + c->ranges[UBIFS_ORPH_NODE].max_len = c->leb_size; + c->ranges[UBIFS_DENT_NODE].min_len = UBIFS_DENT_NODE_SZ; + c->ranges[UBIFS_DENT_NODE].max_len = UBIFS_MAX_DENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].min_len = UBIFS_XENT_NODE_SZ; + c->ranges[UBIFS_XENT_NODE].max_len = UBIFS_MAX_XENT_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].min_len = UBIFS_DATA_NODE_SZ; + c->ranges[UBIFS_DATA_NODE].max_len = UBIFS_MAX_DATA_NODE_SZ; + /* + * Minimum indexing node size is amended later when superblock is + * read and the key length is known. + */ + c->ranges[UBIFS_IDX_NODE].min_len = UBIFS_IDX_NODE_SZ + UBIFS_BRANCH_SZ; + /* + * Maximum indexing node size is amended later when superblock is + * read and the fanout is known. + */ + c->ranges[UBIFS_IDX_NODE].max_len = INT_MAX; + + /* + * Initialize dead and dark LEB space watermarks. See gc.c for comments + * about these values. + */ + c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); + c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space.
+ */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; + + /* Buffer size for bulk-reads, limited to the size of one LEB */ + c->max_bu_buf_len = UBIFS_MAX_BULK_READ * UBIFS_MAX_DATA_NODE_SZ; + if (c->max_bu_buf_len > c->leb_size) + c->max_bu_buf_len = c->leb_size; + return 0; +} + +/** + * bud_wbuf_callback - bud LEB write-buffer synchronization call-back. + * @c: UBIFS file-system description object + * @lnum: LEB the write-buffer was synchronized to + * @free: how many free bytes are left in this LEB + * @pad: how many bytes were padded + * + * This is a callback function which is called by the I/O unit when the + * write-buffer is synchronized. We need this to correctly maintain space + * accounting in bud logical eraseblocks. This function returns zero in case of + * success and a negative error code in case of failure. + * + * This function actually belongs to the journal, but we keep it here because + * we want to keep it static. + */ +static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad) +{ + return ubifs_update_one_lp(c, lnum, free, pad, 0, 0); +} + +/** + * init_constants_sb - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the superblock has been read. It also checks various UBIFS parameters and + * makes sure they are all right. Returns zero in case of success and a + * negative error code in case of failure.
+ */ +static int init_constants_sb(struct ubifs_info *c) +{ + int tmp, err; + long long tmp64; + + c->main_bytes = (long long)c->main_lebs * c->leb_size; + c->max_znode_sz = sizeof(struct ubifs_znode) + + c->fanout * sizeof(struct ubifs_zbranch); + + tmp = ubifs_idx_node_sz(c, 1); + c->ranges[UBIFS_IDX_NODE].min_len = tmp; + c->min_idx_node_sz = ALIGN(tmp, 8); + + tmp = ubifs_idx_node_sz(c, c->fanout); + c->ranges[UBIFS_IDX_NODE].max_len = tmp; + c->max_idx_node_sz = ALIGN(tmp, 8); + + /* Make sure LEB size is large enough to fit full commit */ + tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; + tmp = ALIGN(tmp, c->min_io_size); + if (tmp > c->leb_size) { + dbg_err("too small LEB size %d, at least %d needed", + c->leb_size, tmp); + return -EINVAL; + } + + /* + * Make sure that the log is large enough to fit reference nodes for + * all buds plus one reserved LEB. + */ + tmp64 = c->max_bud_bytes + c->leb_size - 1; + c->max_bud_cnt = div_u64(tmp64, c->leb_size); + tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1); + tmp /= c->leb_size; + tmp += 1; + if (c->log_lebs < tmp) { + dbg_err("too small log %d LEBs, required min. %d LEBs", + c->log_lebs, tmp); + return -EINVAL; + } + + /* + * When budgeting we assume worst-case scenarios when the pages are not + * compressed and direntries are of the maximum size. + * + * Note, data which may be stored in inodes is budgeted separately, so + * it is not included into 'c->inode_budget'. + */ + c->page_budget = UBIFS_MAX_DATA_NODE_SZ * UBIFS_BLOCKS_PER_PAGE; + c->inode_budget = UBIFS_INO_NODE_SZ; + c->dent_budget = UBIFS_MAX_DENT_NODE_SZ; + + /* + * When the amount of flash space used by buds becomes + * 'c->max_bud_bytes', UBIFS just blocks all writers and starts commit. + * The writers are unblocked when the commit is finished. To avoid + * blocking the writers, UBIFS initiates background commit in advance, + * when number of bud bytes becomes above the limit defined below.
+ */ + c->bg_bud_bytes = (c->max_bud_bytes * 13) >> 4; /* i.e., 13/16 of 'c->max_bud_bytes' */ + + /* + * Ensure minimum journal size. All the bytes in the journal heads are + * considered to be used, when calculating the current journal usage. + * Consequently, if the journal is too small, UBIFS will treat it as + * always full. + */ + tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1; + if (c->bg_bud_bytes < tmp64) + c->bg_bud_bytes = tmp64; + if (c->max_bud_bytes < tmp64 + c->leb_size) + c->max_bud_bytes = tmp64 + c->leb_size; + + err = ubifs_calc_lpt_geom(c); + if (err) + return err; + + /* Initialize effective LEB size used in budgeting calculations */ + c->idx_leb_size = c->leb_size - c->max_idx_node_sz; + return 0; +} + +/** + * init_constants_master - initialize UBIFS constants. + * @c: UBIFS file-system description object + * + * This is a helper function which initializes various UBIFS constants after + * the master node has been read. The function only calculates values - it does + * not validate anything and it cannot fail. + */ +static void init_constants_master(struct ubifs_info *c) +{ + long long tmp64; + + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + c->report_rp_size = ubifs_reported_space(c, c->rp_size); + + /* + * Calculate total amount of FS blocks. This number is not used + * internally because it does not make much sense for UBIFS, but it is + * necessary to report something for the 'statfs()' call. + * + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, minimum LEBs for the index, and assume only one journal + * head is available. + */ + tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1; + tmp64 *= (long long)c->leb_size - c->leb_overhead; + tmp64 = ubifs_reported_space(c, tmp64); + c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; +} + +/** + * take_gc_lnum - reserve GC LEB. + * @c: UBIFS file-system description object + * + * This function ensures that the LEB reserved for garbage collection is marked + * as "taken" in lprops.
We also have to set free space to LEB size and dirty + * space to zero, because lprops may contain out-of-date information if the + * file-system was un-mounted before it has been committed. This function + * returns zero in case of success and a negative error code in case of + * failure. + */ +static int take_gc_lnum(struct ubifs_info *c) +{ + int err; + + if (c->gc_lnum == -1) { + ubifs_err("no LEB for GC"); + return -EINVAL; + } + + /* And we have to tell lprops that this LEB is taken */ + err = ubifs_change_one_lp(c, c->gc_lnum, c->leb_size, 0, + LPROPS_TAKEN, 0, 0); + return err; +} + +/** + * alloc_wbufs - allocate write-buffers. + * @c: UBIFS file-system description object + * + * This helper function allocates and initializes UBIFS write-buffers. Returns + * zero in case of success and %-ENOMEM in case of failure. + */ +static int alloc_wbufs(struct ubifs_info *c) +{ + int i, err; + + c->jheads = kzalloc(c->jhead_cnt * sizeof(struct ubifs_jhead), + GFP_KERNEL); + if (!c->jheads) + return -ENOMEM; + + /* Initialize journal heads */ + for (i = 0; i < c->jhead_cnt; i++) { + INIT_LIST_HEAD(&c->jheads[i].buds_list); + err = ubifs_wbuf_init(c, &c->jheads[i].wbuf); + if (err) + return err; + + c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback; + c->jheads[i].wbuf.jhead = i; + } + + c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM; + /* + * Garbage Collector head likely contains long-term data and + * does not need to be synchronized by timer. + */ + c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; + c->jheads[GCHD].wbuf.timeout = 0; + + return 0; +} + +/** + * free_wbufs - free write-buffers. + * @c: UBIFS file-system description object + */ +static void free_wbufs(struct ubifs_info *c) +{ + int i; + + if (c->jheads) { + for (i = 0; i < c->jhead_cnt; i++) { + kfree(c->jheads[i].wbuf.buf); + kfree(c->jheads[i].wbuf.inodes); + } + kfree(c->jheads); + c->jheads = NULL; + } +} + +/** + * free_orphans - free orphans. 
+ * @c: UBIFS file-system description object + * + * This function frees the orphans from the 'c->orph_dnext' chain, then whatever + * is left in 'c->orph_list' (an error message is printed for each such + * orphan), and finally the orphan buffer. + */ +static void free_orphans(struct ubifs_info *c) +{ + struct ubifs_orphan *orph; + + while (c->orph_dnext) { + orph = c->orph_dnext; + c->orph_dnext = orph->dnext; + list_del(&orph->list); + kfree(orph); + } + + while (!list_empty(&c->orph_list)) { + orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); + list_del(&orph->list); + kfree(orph); + dbg_err("orphan list not empty at unmount"); + } + + vfree(c->orph_buf); + c->orph_buf = NULL; +} + +/** + * free_buds - free per-bud objects. + * @c: UBIFS file-system description object + * + * This function walks the 'c->buds' RB-tree without re-balancing it: it goes + * down to a leaf, detaches the leaf from its parent, frees it and continues + * from the parent. Note, the 'c->buds' root pointer itself is not reset here. + */ +static void free_buds(struct ubifs_info *c) +{ + struct rb_node *this = c->buds.rb_node; + struct ubifs_bud *bud; + + while (this) { + if (this->rb_left) + this = this->rb_left; + else if (this->rb_right) + this = this->rb_right; + else { + bud = rb_entry(this, struct ubifs_bud, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &bud->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(bud); + } + } +} + +/** + * check_volume_empty - check if the UBI volume is empty. + * @c: UBIFS file-system description object + * + * This function checks if the UBIFS volume is empty by looking if its LEBs are + * mapped or not. The result of checking is stored in the @c->empty variable. + * Returns zero in case of success and a negative error code in case of + * failure. + */ +static int check_volume_empty(struct ubifs_info *c) +{ + int lnum, err; + + c->empty = 1; + for (lnum = 0; lnum < c->leb_cnt; lnum++) { + err = ubi_is_mapped(c->ubi, lnum); + if (unlikely(err < 0)) + return err; + if (err == 1) { /* one mapped LEB is enough to know the volume is not empty */ + c->empty = 0; + break; + } + + cond_resched(); + } + + return 0; +} + +/* + * UBIFS mount options.
+ * + * Opt_fast_unmount: do not run a journal commit before un-mounting + * Opt_norm_unmount: run a journal commit before un-mounting + * Opt_bulk_read: enable bulk-reads + * Opt_no_bulk_read: disable bulk-reads + * Opt_chk_data_crc: check CRCs when reading data nodes + * Opt_no_chk_data_crc: do not check CRCs when reading data nodes + * Opt_override_compr: override default compressor + * Opt_err: just end of array marker + */ +enum { + Opt_fast_unmount, + Opt_norm_unmount, + Opt_bulk_read, + Opt_no_bulk_read, + Opt_chk_data_crc, + Opt_no_chk_data_crc, + Opt_override_compr, + Opt_err, +}; + +static match_table_t tokens = { + {Opt_fast_unmount, "fast_unmount"}, + {Opt_norm_unmount, "norm_unmount"}, + {Opt_bulk_read, "bulk_read"}, + {Opt_no_bulk_read, "no_bulk_read"}, + {Opt_chk_data_crc, "chk_data_crc"}, + {Opt_no_chk_data_crc, "no_chk_data_crc"}, + {Opt_override_compr, "compr=%s"}, + {Opt_err, NULL}, +}; + +/** + * ubifs_parse_options - parse mount parameters. + * @c: UBIFS file-system description object + * @options: parameters to parse, may be %NULL + * @is_remount: non-zero if this is FS re-mount (not used at the moment) + * + * This function parses UBIFS mount options and returns zero in case of success + * and a negative error code in case of failure. Note, the @options string is + * modified by 'strsep()'. + */ +static int ubifs_parse_options(struct ubifs_info *c, char *options, + int is_remount) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + if (!options) + return 0; + + while ((p = strsep(&options, ","))) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + /* + * %Opt_fast_unmount and %Opt_norm_unmount options are ignored. + * We accept them in order to be backward-compatible. But this + * should be removed at some point.
+ */ + case Opt_fast_unmount: + c->mount_opts.unmount_mode = 2; /* 'mount_opts' values: 2 - positive form of the option, 1 - negative form */ + break; + case Opt_norm_unmount: + c->mount_opts.unmount_mode = 1; + break; + case Opt_bulk_read: + c->mount_opts.bulk_read = 2; + c->bulk_read = 1; + break; + case Opt_no_bulk_read: + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + break; + case Opt_chk_data_crc: + c->mount_opts.chk_data_crc = 2; + c->no_chk_data_crc = 0; + break; + case Opt_no_chk_data_crc: + c->mount_opts.chk_data_crc = 1; + c->no_chk_data_crc = 1; + break; + case Opt_override_compr: + { + char *name = match_strdup(&args[0]); /* a copy which has to be freed on every path below */ + + if (!name) + return -ENOMEM; + if (!strcmp(name, "none")) + c->mount_opts.compr_type = UBIFS_COMPR_NONE; + else if (!strcmp(name, "lzo")) + c->mount_opts.compr_type = UBIFS_COMPR_LZO; + else if (!strcmp(name, "zlib")) + c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; + else { + ubifs_err("unknown compressor \"%s\"", name); + kfree(name); + return -EINVAL; + } + kfree(name); + c->mount_opts.override_compr = 1; + c->default_compr = c->mount_opts.compr_type; + break; + } + default: + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + } + + return 0; +} + +/** + * destroy_journal - destroy journal data structures. + * @c: UBIFS file-system description object + * + * This function destroys journal data structures including those that may have + * been created by recovery functions: the unclean LEB list, the old buds list, + * the index GC data, the size tree, the TNC and the buds tree. + */ +static void destroy_journal(struct ubifs_info *c) +{ + while (!list_empty(&c->unclean_leb_list)) { + struct ubifs_unclean_leb *ucleb; + + ucleb = list_entry(c->unclean_leb_list.next, + struct ubifs_unclean_leb, list); + list_del(&ucleb->list); + kfree(ucleb); + } + while (!list_empty(&c->old_buds)) { + struct ubifs_bud *bud; + + bud = list_entry(c->old_buds.next, struct ubifs_bud, list); + list_del(&bud->list); + kfree(bud); + } + ubifs_destroy_idx_gc(c); + ubifs_destroy_size_tree(c); + ubifs_tnc_close(c); + free_buds(c); +} + +/** + * bu_init - initialize bulk-read information.
+ * @c: UBIFS file-system description object + */ +static void bu_init(struct ubifs_info *c) +{ + ubifs_assert(c->bulk_read == 1); + + if (c->bu.buf) + return; /* Already initialized */ + /* 'again' is re-entered at most once, with the length lowered to UBIFS_KMALLOC_OK */ +again: + c->bu.buf = kmalloc(c->max_bu_buf_len, GFP_KERNEL | __GFP_NOWARN); + if (!c->bu.buf) { + if (c->max_bu_buf_len > UBIFS_KMALLOC_OK) { + c->max_bu_buf_len = UBIFS_KMALLOC_OK; + goto again; + } + + /* + * Even the %UBIFS_KMALLOC_OK sized allocation failed - just + * disable bulk-read. + */ + ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, " + "disabling it", c->max_bu_buf_len); + c->mount_opts.bulk_read = 1; + c->bulk_read = 0; + return; + } +} + +/** + * check_free_space - check if there is enough free space to mount. + * @c: UBIFS file-system description object + * + * This function makes sure UBIFS has enough free space to be mounted in + * read/write mode. UBIFS must always have some free space to allow deletions. + * + * Returns zero if the sum of free and dirty space is not below @c->dark_wm, + * and %-ENOSPC (after calling the debugging dump helpers) otherwise. + */ +static int check_free_space(struct ubifs_info *c) +{ + ubifs_assert(c->dark_wm > 0); + if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { + ubifs_err("insufficient free space to mount in read/write mode"); + dbg_dump_budg(c); + dbg_dump_lprops(c); + return -ENOSPC; + } + return 0; +} + +/** + * mount_ubifs - mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * This function mounts UBIFS file system. Returns zero in case of success and + * a negative error code in case of failure. + * + * Note, the function does not de-allocate resources if it fails half way + * through, and the caller has to do this instead. + * + * NOTE(review): the body ends with 'out_*' clean-up labels which do free + * resources on failure, so the note above looks stale - confirm. 
+ */ +static int mount_ubifs(struct ubifs_info *c) +{ + struct super_block *sb = c->vfs_sb; + int err, mounted_read_only = (sb->s_flags & MS_RDONLY); + long long x; + size_t sz; + + err = init_constants_early(c); + if (err) + return err; + + err = ubifs_debugging_init(c); + if (err) + return err; + + err = check_volume_empty(c); + if (err) + goto out_free; + + if (c->empty && (mounted_read_only || c->ro_media)) { + /* + * This UBI volume is empty, and read-only, or the file system + * is mounted read-only - we cannot format it. + */ + ubifs_err("can't format empty UBI volume: read-only %s", + c->ro_media ? "UBI volume" : "mount"); + err = -EROFS; + goto out_free; + } + + if (c->ro_media && !mounted_read_only) { + ubifs_err("cannot mount read-write - read-only media"); + err = -EROFS; + goto out_free; + } + + /* + * The requirement for the buffer is that it should fit indexing B-tree + * height amount of integers. We assume the height of the TNC tree will + * never exceed 64. + */ + err = -ENOMEM; + c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL); + if (!c->bottom_up_buf) + goto out_free; + /* 'err' is still -ENOMEM for the two vmalloc() calls below */ + c->sbuf = vmalloc(c->leb_size); + if (!c->sbuf) + goto out_free; + + if (!mounted_read_only) { + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) + goto out_free; + } + + if (c->bulk_read == 1) + bu_init(c); + + /* + * We have to check all CRCs, even for data nodes, when we mount the FS + * (specifically, when we are replaying). + */ + c->always_chk_crc = 1; + + err = ubifs_read_superblock(c); + if (err) + goto out_free; + + /* + * Make sure the compressor which is set as default in the superblock + * or overridden by mount options is actually compiled in. 
+ */ + if (!ubifs_compr_present(c->default_compr)) { + ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_compr_name(c->default_compr)); + goto out_free; + } + + err = init_constants_sb(c); + if (err) + goto out_free; + + sz = ALIGN(c->max_idx_node_sz, c->min_io_size); + sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size); + c->cbuf = kmalloc(sz, GFP_NOFS); + if (!c->cbuf) { + err = -ENOMEM; + goto out_free; + } + + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); + if (!mounted_read_only) { + err = alloc_wbufs(c); + if (err) + goto out_cbuf; + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out_wbufs; + } + wake_up_process(c->bgt); + } + + err = ubifs_read_master(c); + if (err) + goto out_master; + + init_constants_master(c); + + if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { + ubifs_msg("recovery needed"); + c->need_recovery = 1; + if (!mounted_read_only) { + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out_master; + } + } else if (!mounted_read_only) { + /* + * Set the "dirty" flag so that if we reboot uncleanly we + * will notice this immediately on the next mount. 
+ */ + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out_master; + } + + err = ubifs_lpt_init(c, 1, !mounted_read_only); + if (err) + goto out_lpt; + + err = dbg_check_idx_size(c, c->old_idx_sz); + if (err) + goto out_lpt; + + err = ubifs_replay_journal(c); + if (err) + goto out_journal; + + err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only); + if (err) + goto out_orphans; + + if (!mounted_read_only) { + int lnum; + + err = check_free_space(c); + if (err) + goto out_orphans; + + /* + * Check for enough log space: if the LEB following the log head + * (wrapping at the end of the log area) is the log tail, the + * log has to be consolidated first. + */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out_orphans; + } + + if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + err = ubifs_rcvry_gc_commit(c); + /* + * Do not let the check below overwrite the result of a + * failed recovery commit. + */ + if (err) + goto out_orphans; + } else { + err = take_gc_lnum(c); + if (err) + goto out_orphans; + + /* + * GC LEB may contain garbage if there was an unclean + * reboot, and it should be un-mapped. + */ + err = ubifs_leb_unmap(c, c->gc_lnum); + /* + * Unwind like every other failure here - a bare return + * would skip all the clean-up labels. + */ + if (err) + goto out_orphans; + } + + err = dbg_check_lprops(c); + if (err) + goto out_orphans; + } else if (c->need_recovery) { + err = ubifs_recover_size(c); + if (err) + goto out_orphans; + } else { + /* + * Even if we mount read-only, we have to set space in GC LEB + * to proper value because this affects UBIFS free space + * reporting. We do not want to have a situation when + * re-mounting from R/O to R/W changes amount of free space. + */ + err = take_gc_lnum(c); + if (err) + goto out_orphans; + } + + spin_lock(&ubifs_infos_lock); + list_add_tail(&c->infos_list, &ubifs_infos); + spin_unlock(&ubifs_infos_lock); + + if (c->need_recovery) { + if (mounted_read_only) + ubifs_msg("recovery deferred"); + else { + c->need_recovery = 0; + ubifs_msg("recovery completed"); + /* + * GC LEB has to be empty and taken at this point. 
But + * the journal head LEBs may also be accounted as + * "empty taken" if they are empty. + */ + ubifs_assert(c->lst.taken_empty_lebs > 0); + } + } else + ubifs_assert(c->lst.taken_empty_lebs > 0); + + err = dbg_check_filesystem(c); + if (err) + goto out_infos; + + err = dbg_debugfs_init_fs(c); + if (err) + goto out_infos; + + c->always_chk_crc = 0; + + ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"", + c->vi.ubi_num, c->vi.vol_id, c->vi.name); + if (mounted_read_only) + ubifs_msg("mounted read-only"); + x = (long long)c->main_lebs * c->leb_size; + ubifs_msg("file system size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->main_lebs); + x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; + ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " + "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); + ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)", + c->fmt_version, c->ro_compat_version, + UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); + ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); + ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + c->report_rp_size, c->report_rp_size >> 10); + + dbg_msg("compiled on: " __DATE__ " at " __TIME__); + dbg_msg("min. 
I/O unit size: %d bytes", c->min_io_size); + dbg_msg("LEB size: %d bytes (%d KiB)", + c->leb_size, c->leb_size >> 10); + dbg_msg("data journal heads: %d", + c->jhead_cnt - NONDATA_JHEADS_CNT); + dbg_msg("UUID: %02X%02X%02X%02X-%02X%02X" + "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X", + c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3], + c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7], + c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11], + c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]); + dbg_msg("big_lpt %d", c->big_lpt); + dbg_msg("log LEBs: %d (%d - %d)", + c->log_lebs, UBIFS_LOG_LNUM, c->log_last); + dbg_msg("LPT area LEBs: %d (%d - %d)", + c->lpt_lebs, c->lpt_first, c->lpt_last); + dbg_msg("orphan area LEBs: %d (%d - %d)", + c->orph_lebs, c->orph_first, c->orph_last); + dbg_msg("main area LEBs: %d (%d - %d)", + c->main_lebs, c->main_first, c->leb_cnt - 1); + dbg_msg("index LEBs: %d", c->lst.idx_lebs); + dbg_msg("total index bytes: %lld (%lld KiB, %lld MiB)", + c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20); + dbg_msg("key hash type: %d", c->key_hash_type); + dbg_msg("tree fanout: %d", c->fanout); + dbg_msg("reserved GC LEB: %d", c->gc_lnum); + dbg_msg("first main LEB: %d", c->main_first); + dbg_msg("max. znode size %d", c->max_znode_sz); + dbg_msg("max. index node size %d", c->max_idx_node_sz); + dbg_msg("node sizes: data %zu, inode %zu, dentry %zu", + UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ); + dbg_msg("node sizes: trun %zu, sb %zu, master %zu", + UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ); + dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu", + UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ); + dbg_msg("max. 
node sizes: data %zu, inode %zu dentry %zu", + UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ, + UBIFS_MAX_DENT_NODE_SZ); + dbg_msg("dead watermark: %d", c->dead_wm); + dbg_msg("dark watermark: %d", c->dark_wm); + dbg_msg("LEB overhead: %d", c->leb_overhead); + x = (long long)c->main_lebs * c->dark_wm; + dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)", + x, x >> 10, x >> 20); + dbg_msg("maximum bud bytes: %lld (%lld KiB, %lld MiB)", + c->max_bud_bytes, c->max_bud_bytes >> 10, + c->max_bud_bytes >> 20); + dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)", + c->bg_bud_bytes, c->bg_bud_bytes >> 10, + c->bg_bud_bytes >> 20); + dbg_msg("current bud bytes %lld (%lld KiB, %lld MiB)", + c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20); + dbg_msg("max. seq. number: %llu", c->max_sqnum); + dbg_msg("commit number: %llu", c->cmt_no); + + return 0; + +out_infos: + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); +out_orphans: + free_orphans(c); +out_journal: + destroy_journal(c); +out_lpt: + ubifs_lpt_free(c, 0); +out_master: + kfree(c->mst_node); + kfree(c->rcvrd_mst_node); + if (c->bgt) + kthread_stop(c->bgt); +out_wbufs: + free_wbufs(c); +out_cbuf: + kfree(c->cbuf); +out_free: + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); + return err; +} + +/** + * ubifs_umount - un-mount UBIFS file-system. + * @c: UBIFS file-system description object + * + * Note, this function is called to free allocated resourced when un-mounting, + * as well as free resources when an error occurred while we were half way + * through mounting (error path cleanup function). So it has to make sure the + * resource was actually allocated before freeing it. 
+ */ +static void ubifs_umount(struct ubifs_info *c) +{ + dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + + dbg_debugfs_exit_fs(c); + spin_lock(&ubifs_infos_lock); + list_del(&c->infos_list); + spin_unlock(&ubifs_infos_lock); + + if (c->bgt) + kthread_stop(c->bgt); + + destroy_journal(c); + free_wbufs(c); + free_orphans(c); + ubifs_lpt_free(c, 0); + + kfree(c->cbuf); + kfree(c->rcvrd_mst_node); + kfree(c->mst_node); + kfree(c->bu.buf); + vfree(c->ileb_buf); + vfree(c->sbuf); + kfree(c->bottom_up_buf); + ubifs_debugging_exit(c); +} + +/** + * ubifs_remount_rw - re-mount in read-write mode. + * @c: UBIFS file-system description object + * + * UBIFS avoids allocating many unnecessary resources when mounted in read-only + * mode. This function allocates the needed resources and re-mounts UBIFS in + * read-write mode. + */ +static int ubifs_remount_rw(struct ubifs_info *c) +{ + int err, lnum; + + if (c->rw_incompat) { + ubifs_err("the file-system is not R/W-compatible"); + ubifs_msg("on-flash format version is w%d/r%d, but software " + "only supports up to version w%d/r%d", c->fmt_version, + c->ro_compat_version, UBIFS_FORMAT_VERSION, + UBIFS_RO_COMPAT_VERSION); + return -EROFS; + } + + mutex_lock(&c->umount_mutex); + dbg_save_space_info(c); + c->remounting_rw = 1; + c->always_chk_crc = 1; + + err = check_free_space(c); + if (err) + goto out; + + if (c->old_leb_cnt != c->leb_cnt) { + struct ubifs_sb_node *sup; + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) { + err = PTR_ERR(sup); + goto out; + } + sup->leb_cnt = cpu_to_le32(c->leb_cnt); + err = ubifs_write_sb_node(c, sup); + if (err) + goto out; + } + + if (c->need_recovery) { + ubifs_msg("completing deferred recovery"); + err = ubifs_write_rcvrd_mst_node(c); + if (err) + goto out; + err = ubifs_recover_size(c); + if (err) + goto out; + err = ubifs_clean_lebs(c, c->sbuf); + if (err) + goto out; + err = ubifs_recover_inl_heads(c, c->sbuf); + if (err) + goto out; + } else { + /* A 
readonly mount is not allowed to have orphans */ + ubifs_assert(c->tot_orphans == 0); + err = ubifs_clear_orphans(c); + if (err) + goto out; + } + + if (!(c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY))) { + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY); + err = ubifs_write_master(c); + if (err) + goto out; + } + + c->ileb_buf = vmalloc(c->leb_size); + if (!c->ileb_buf) { + err = -ENOMEM; + goto out; + } + + err = ubifs_lpt_init(c, 0, 1); + if (err) + goto out; + + err = alloc_wbufs(c); + if (err) + goto out; + + ubifs_create_buds_lists(c); + + /* Create background thread */ + c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name); + if (IS_ERR(c->bgt)) { + err = PTR_ERR(c->bgt); + c->bgt = NULL; + ubifs_err("cannot spawn \"%s\", error %d", + c->bgt_name, err); + goto out; + } + wake_up_process(c->bgt); + + c->orph_buf = vmalloc(c->leb_size); + if (!c->orph_buf) { + err = -ENOMEM; + goto out; + } + + /* Check for enough log space */ + lnum = c->lhead_lnum + 1; + if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) + lnum = UBIFS_LOG_LNUM; + if (lnum == c->ltail_lnum) { + err = ubifs_consolidate_log(c); + if (err) + goto out; + } + + if (c->need_recovery) + err = ubifs_rcvry_gc_commit(c); + else + err = ubifs_leb_unmap(c, c->gc_lnum); + if (err) + goto out; + + if (c->need_recovery) { + c->need_recovery = 0; + ubifs_msg("deferred recovery completed"); + } + + dbg_gen("re-mounted read-write"); + c->vfs_sb->s_flags &= ~MS_RDONLY; + c->remounting_rw = 0; + c->always_chk_crc = 0; + err = dbg_check_space_info(c); + mutex_unlock(&c->umount_mutex); + return err; + +out: + vfree(c->orph_buf); + c->orph_buf = NULL; + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + free_wbufs(c); + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + c->remounting_rw = 0; + c->always_chk_crc = 0; + mutex_unlock(&c->umount_mutex); + return err; +} + +/** + * ubifs_remount_ro - re-mount in read-only mode. 
+ * @c: UBIFS file-system description object + * + * We assume VFS has stopped writing. Possibly the background thread could be + * running a commit, however kthread_stop will wait in that case. + */ +static void ubifs_remount_ro(struct ubifs_info *c) +{ + int i, err; + + ubifs_assert(!c->need_recovery); + ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY)); + + mutex_lock(&c->umount_mutex); + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + dbg_save_space_info(c); + /* Synchronize write-buffers and stop their timers */ + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + /* Clear the dirty flag, set the no-orphans flag and save the GC LEB number */ + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + ubifs_ro_mode(c, err); + /* Free what 'ubifs_remount_rw()' allocates for read-write operation */ + free_wbufs(c); + vfree(c->orph_buf); + c->orph_buf = NULL; + vfree(c->ileb_buf); + c->ileb_buf = NULL; + ubifs_lpt_free(c, 1); + err = dbg_check_space_info(c); + if (err) + ubifs_ro_mode(c, err); + mutex_unlock(&c->umount_mutex); +} + /* VFS 'put_super' handler: tries to write a clean master node, then frees @sb->s_fs_info */ +static void ubifs_put_super(struct super_block *sb) +{ + int i; + struct ubifs_info *c = sb->s_fs_info; + + ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, + c->vi.vol_id); + /* + * The following asserts are only valid if there has not been a failure + * of the media. For example, there will be dirty inodes if we failed + * to write them back because of I/O errors. + */ + ubifs_assert(atomic_long_read(&c->dirty_pg_cnt) == 0); + ubifs_assert(c->budg_idx_growth == 0); + ubifs_assert(c->budg_dd_growth == 0); + ubifs_assert(c->budg_data_growth == 0); + + /* + * The 'c->umount_mutex' prevents races between UBIFS memory shrinker + * and file system un-mount. Namely, it prevents the shrinker from + * picking this superblock for shrinking - it will be just skipped if + * the mutex is locked. 
+ */ + mutex_lock(&c->umount_mutex); + if (!(c->vfs_sb->s_flags & MS_RDONLY)) { + /* + * First of all kill the background thread to make sure it does + * not interfere with un-mounting and freeing resources. + */ + if (c->bgt) { + kthread_stop(c->bgt); + c->bgt = NULL; + } + + /* Synchronize write-buffers */ + if (c->jheads) + for (i = 0; i < c->jhead_cnt; i++) { + ubifs_wbuf_sync(&c->jheads[i].wbuf); + del_timer_sync(&c->jheads[i].wbuf.timer); + } + + /* + * On fatal errors c->ro_media is set to 1, in which case we do + * not write the master node. + */ + if (!c->ro_media) { + /* + * We are being cleanly unmounted which means the + * orphans were killed - indicate this in the master + * node. Also save the reserved GC LEB number. + */ + int err; + + c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); + c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); + c->mst_node->gc_lnum = cpu_to_le32(c->gc_lnum); + err = ubifs_write_master(c); + if (err) + /* + * Recovery will attempt to fix the master area + * next mount, so we just print a message and + * continue to unmount normally. 
+ */ + ubifs_err("failed to write master node, " + "error %d", err); + } + } + + ubifs_umount(c); + bdi_destroy(&c->bdi); + ubi_close_volume(c->ubi); + mutex_unlock(&c->umount_mutex); + kfree(c); +} + +static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) +{ + int err; + struct ubifs_info *c = sb->s_fs_info; + + dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags); + + err = ubifs_parse_options(c, data, 1); + if (err) { + ubifs_err("invalid or unknown remount parameter"); + return err; + } + + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + if (c->ro_media) { + ubifs_msg("cannot re-mount due to prior errors"); + return -EROFS; + } + err = ubifs_remount_rw(c); + if (err) + return err; + } else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) { + if (c->ro_media) { + ubifs_msg("cannot re-mount due to prior errors"); + return -EROFS; + } + ubifs_remount_ro(c); + } + + if (c->bulk_read == 1) + bu_init(c); + else { + dbg_gen("disable bulk-read"); + kfree(c->bu.buf); + c->bu.buf = NULL; + } + + ubifs_assert(c->lst.taken_empty_lebs > 0); + return 0; +} + +const struct super_operations ubifs_super_operations = { + .read_inode = ubifs_read_inode, + .alloc_inode = ubifs_alloc_inode, + .destroy_inode = ubifs_destroy_inode, + .put_super = ubifs_put_super, + .write_inode = ubifs_write_inode, + .delete_inode = ubifs_delete_inode, + .statfs = ubifs_statfs, + .dirty_inode = ubifs_dirty_inode, + .remount_fs = ubifs_remount_fs, + .show_options = ubifs_show_options, + .sync_fs = ubifs_sync_fs, +}; + +/** + * open_ubi - parse UBI device name string and open the UBI device. + * @name: UBI volume name + * @mode: UBI volume open mode + * + * There are several ways to specify UBI volumes when mounting UBIFS: + * o ubiX_Y - UBI device number X, volume Y; + * o ubiY - UBI device number 0, volume Y; + * o ubiX:NAME - mount UBI device X, volume with name NAME; + * o ubi:NAME - mount UBI device 0, volume with name NAME. + * + * Alternative '!' 
separator may be used instead of ':' (because some shells + * like busybox may interpret ':' as an NFS host name separator). This function + * returns ubi volume object in case of success and a negative error code in + * case of failure. + */ +static struct ubi_volume_desc *open_ubi(const char *name, int mode) +{ + int dev, vol; + char *endptr; + + if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') + return ERR_PTR(-EINVAL); + + /* ubi:NAME method */ + if ((name[3] == ':' || name[3] == '!') && name[4] != '\0') + return ubi_open_volume_nm(0, name + 4, mode); + + if (!isdigit(name[3])) + return ERR_PTR(-EINVAL); + + dev = simple_strtoul(name + 3, &endptr, 0); + + /* ubiY method */ + if (*endptr == '\0') + return ubi_open_volume(0, dev, mode); + + /* ubiX_Y method */ + if (*endptr == '_' && isdigit(endptr[1])) { + vol = simple_strtoul(endptr + 1, &endptr, 0); + if (*endptr != '\0') + return ERR_PTR(-EINVAL); + return ubi_open_volume(dev, vol, mode); + } + + /* ubiX:NAME method */ + if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0') + return ubi_open_volume_nm(dev, ++endptr, mode); + + return ERR_PTR(-EINVAL); +} + +static int ubifs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct ubi_volume_desc *ubi = sb->s_fs_info; + struct ubifs_info *c; + struct inode *root; + int err; + + c = kzalloc(sizeof(struct ubifs_info), GFP_KERNEL); + if (!c) + return -ENOMEM; + + spin_lock_init(&c->cnt_lock); + spin_lock_init(&c->cs_lock); + spin_lock_init(&c->buds_lock); + spin_lock_init(&c->space_lock); + spin_lock_init(&c->orphan_lock); + init_rwsem(&c->commit_sem); + mutex_init(&c->lp_mutex); + mutex_init(&c->tnc_mutex); + mutex_init(&c->log_mutex); + mutex_init(&c->mst_mutex); + mutex_init(&c->umount_mutex); + mutex_init(&c->bu_mutex); + init_waitqueue_head(&c->cmt_wq); + c->buds = RB_ROOT; + c->old_idx = RB_ROOT; + c->size_tree = RB_ROOT; + c->orph_tree = RB_ROOT; + INIT_LIST_HEAD(&c->infos_list); + INIT_LIST_HEAD(&c->idx_gc); + 
INIT_LIST_HEAD(&c->replay_list); + INIT_LIST_HEAD(&c->replay_buds); + INIT_LIST_HEAD(&c->uncat_list); + INIT_LIST_HEAD(&c->empty_list); + INIT_LIST_HEAD(&c->freeable_list); + INIT_LIST_HEAD(&c->frdi_idx_list); + INIT_LIST_HEAD(&c->unclean_leb_list); + INIT_LIST_HEAD(&c->old_buds); + INIT_LIST_HEAD(&c->orph_list); + INIT_LIST_HEAD(&c->orph_new); + + c->highest_inum = UBIFS_FIRST_INO; + c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; + + ubi_get_volume_info(ubi, &c->vi); + ubi_get_device_info(c->vi.ubi_num, &c->di); + + /* Re-open the UBI device in read-write mode */ + c->ubi = ubi_open_volume(c->vi.ubi_num, c->vi.vol_id, UBI_READWRITE); + if (IS_ERR(c->ubi)) { + err = PTR_ERR(c->ubi); + goto out_free; + } + + /* + * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For + * UBIFS, I/O is not deferred, it is done immediately in readpage, + * which means the user would have to wait not just for their own I/O + * but the read-ahead I/O as well i.e. completely pointless. + * + * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 
+ */ + c->bdi.capabilities = BDI_CAP_MAP_COPY; + c->bdi.unplug_io_fn = default_unplug_io_fn; + err = bdi_init(&c->bdi); + if (err) + goto out_close; + + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_bdi; + + c->vfs_sb = sb; + + sb->s_fs_info = c; + sb->s_magic = UBIFS_SUPER_MAGIC; + sb->s_blocksize = UBIFS_BLOCK_SIZE; + sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; + sb->s_dev = c->vi.cdev; + /* The maximum file size is limited by both the key format and the VFS */ + sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); + if (c->max_inode_sz > MAX_LFS_FILESIZE) + sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; + sb->s_op = &ubifs_super_operations; + + mutex_lock(&c->umount_mutex); + err = mount_ubifs(c); + if (err) { + ubifs_assert(err < 0); + goto out_unlock; + } + + /* Read the root inode */ + root = ubifs_iget(sb, UBIFS_ROOT_INO); + if (IS_ERR(root)) { + err = PTR_ERR(root); + goto out_umount; + } + + sb->s_root = d_alloc_root(root); + if (!sb->s_root) { + /* + * 'err' is zero after the successful mount above, so set it + * explicitly - otherwise the caller would see success and go + * on with a super block which has no root and a freed @c. + */ + err = -ENOMEM; + goto out_iput; + } + + mutex_unlock(&c->umount_mutex); + return 0; + +out_iput: + iput(root); +out_umount: + ubifs_umount(c); +out_unlock: + mutex_unlock(&c->umount_mutex); +out_bdi: + bdi_destroy(&c->bdi); +out_close: + ubi_close_volume(c->ubi); +out_free: + kfree(c); + return err; +} + +/* 'sget()' test callback: super blocks are matched by the device number in @data */ +static int sb_test(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + return sb->s_dev == *dev; +} + +/* 'sget()' set callback: stores the device number in @data in a new super block */ +static int sb_set(struct super_block *sb, void *data) +{ + dev_t *dev = data; + + sb->s_dev = *dev; + return 0; +} + +static int ubifs_get_sb(struct file_system_type *fs_type, int flags, + const char *name, void *data, struct vfsmount *mnt) +{ + struct ubi_volume_desc *ubi; + struct ubi_volume_info vi; + struct super_block *sb; + int err; + + dbg_gen("name %s, flags %#x", name, flags); + + /* + * Get UBI device number and volume ID. Mount it read-only so far + * because this might be a new mount point, and UBI allows only one + * read-write user at a time. 
+ */ + ubi = open_ubi(name, UBI_READONLY); + if (IS_ERR(ubi)) { + ubifs_err("cannot open \"%s\", error %d", + name, (int)PTR_ERR(ubi)); + return PTR_ERR(ubi); + } + ubi_get_volume_info(ubi, &vi); + + dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); + + sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + goto out_close; + } + + if (sb->s_root) { + /* A new mount point for already mounted UBIFS */ + dbg_gen("this ubi volume is already mounted"); + if ((flags ^ sb->s_flags) & MS_RDONLY) { + err = -EBUSY; + goto out_deact; + } + } else { + sb->s_flags = flags; + /* + * Pass 'ubi' to 'fill_super()' in sb->s_fs_info where it is + * replaced by 'c'. + */ + sb->s_fs_info = ubi; + err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + if (err) + goto out_deact; + /* We do not support atime */ + sb->s_flags |= MS_ACTIVE | MS_NOATIME; + } + + /* 'fill_super()' opens ubi again so we must close it here */ + ubi_close_volume(ubi); + + return simple_set_mnt(mnt, sb); + +out_deact: + up_write(&sb->s_umount); + deactivate_super(sb); +out_close: + ubi_close_volume(ubi); + return err; +} + +static void ubifs_kill_sb(struct super_block *sb) +{ + generic_shutdown_super(sb); +} + +static struct file_system_type ubifs_fs_type = { + .name = "ubifs", + .owner = THIS_MODULE, + .get_sb = ubifs_get_sb, + .kill_sb = ubifs_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * Inode slab cache constructor. 
+ */ +static void inode_slab_ctor(struct kmem_cache *cachep, void *obj) +{ + struct ubifs_inode *ui = obj; + inode_init_once(&ui->vfs_inode); +} + +static int __init ubifs_init(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); + + /* Make sure node sizes are 8-byte aligned */ + BUILD_BUG_ON(UBIFS_CH_SZ & 7); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_SB_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_CS_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_ORPH_NODE_SZ & 7); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ & 7); + BUILD_BUG_ON(UBIFS_MAX_NODE_SZ & 7); + BUILD_BUG_ON(MIN_WRITE_SZ & 7); + + /* Check min. node size */ + BUILD_BUG_ON(UBIFS_INO_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_DENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_XENT_NODE_SZ < MIN_WRITE_SZ); + BUILD_BUG_ON(UBIFS_TRUN_NODE_SZ < MIN_WRITE_SZ); + + BUILD_BUG_ON(UBIFS_MAX_DENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_XENT_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_DATA_NODE_SZ > UBIFS_MAX_NODE_SZ); + BUILD_BUG_ON(UBIFS_MAX_INO_NODE_SZ > UBIFS_MAX_NODE_SZ); + + /* Defined node sizes */ + BUILD_BUG_ON(UBIFS_SB_NODE_SZ != 4096); + BUILD_BUG_ON(UBIFS_MST_NODE_SZ != 512); + BUILD_BUG_ON(UBIFS_INO_NODE_SZ != 160); + BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64); + + /* + * We use 2 bit wide bit-fields to store compression type, which should + * be amended if more compressors are added. The bit-fields are: + * @compr_type in 'struct ubifs_inode', @default_compr in + * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'. 
+ */ + BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4); + + /* + * We require that PAGE_CACHE_SIZE is greater-than-or-equal-to + * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. + */ + if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { + ubifs_err("VFS page cache size is %u bytes, but UBIFS requires" + " at least 4096 bytes", + (unsigned int)PAGE_CACHE_SIZE); + return -EINVAL; + } + + err = register_filesystem(&ubifs_fs_type); + if (err) { + ubifs_err("cannot register file system, error %d", err); + return err; + } + + err = -ENOMEM; + ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", + sizeof(struct ubifs_inode), 0, + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, + &inode_slab_ctor); + if (!ubifs_inode_slab) + goto out_reg; + + register_shrinker(&ubifs_shrinker_info); + + err = ubifs_compressors_init(); + if (err) + goto out_shrinker; + + err = dbg_debugfs_init(); + if (err) + goto out_compr; + + return 0; + +out_compr: + ubifs_compressors_exit(); +out_shrinker: + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); +out_reg: + unregister_filesystem(&ubifs_fs_type); + return err; +} +/* late_initcall to let compressors initialize first */ +late_initcall(ubifs_init); + +static void __exit ubifs_exit(void) +{ + ubifs_assert(list_empty(&ubifs_infos)); + ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0); + + dbg_debugfs_exit(); + ubifs_compressors_exit(); + unregister_shrinker(&ubifs_shrinker_info); + kmem_cache_destroy(ubifs_inode_slab); + unregister_filesystem(&ubifs_fs_type); +} +module_exit(ubifs_exit); + +MODULE_LICENSE("GPL"); +MODULE_VERSION(__stringify(UBIFS_VERSION)); +MODULE_AUTHOR("Artem Bityutskiy, Adrian Hunter"); +MODULE_DESCRIPTION("UBIFS - UBI File System"); diff -Nurd linux-2.6.24/fs/ubifs/tnc.c ubifs-v2.6.24/fs/ubifs/tnc.c --- linux-2.6.24/fs/ubifs/tnc.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/tnc.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,3270 @@ +/* + * This file is part of UBIFS. 
+ * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file implements TNC (Tree Node Cache) which caches indexing nodes of + * the UBIFS B-tree. + * + * At the moment the locking rules of the TNC tree are quite simple and + * straightforward. We just have a mutex and lock it when we traverse the + * tree. If a znode is not in memory, we read it from flash while still having + * the mutex locked. + */ + +#include +#include "ubifs.h" + +/* + * Return codes of the 'matches_name()' and 'fallible_matches_name()' functions. + * @NAME_LESS: name corresponding to the first argument is less than second + * @NAME_MATCHES: names match + * @NAME_GREATER: name corresponding to the second argument is greater than + * first + * @NOT_ON_MEDIA: node referred by zbranch does not exist on the media + * + * These constants were introduced to improve readability. + */ +enum { + NAME_LESS = 0, + NAME_MATCHES = 1, + NAME_GREATER = 2, + NOT_ON_MEDIA = 3, +}; + +/** + * insert_old_idx - record an index node obsoleted since the last commit start. + * @c: UBIFS file-system description object + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + * + * Returns %0 on success, and a negative error code on failure. 
+ * + * For recovery, there must always be a complete intact version of the index on + * flash at all times. That is called the "old index". It is the index as at the + * time of the last successful commit. Many of the index nodes in the old index + * may be dirty, but they must not be erased until the next successful commit + * (at which point that index becomes the old index). + * + * That means that the garbage collection and the in-the-gaps method of + * committing must be able to determine if an index node is in the old index. + * Most of the old index nodes can be found by looking up the TNC using the + * 'lookup_znode()' function. However, some of the old index nodes may have + * been deleted from the current index or may have been changed so much that + * they cannot be easily found. In those cases, an entry is added to an RB-tree. + * That is what this function does. The RB-tree is ordered by LEB number and + * offset because they uniquely identify the old index node. + * + * If an entry for @lnum:@offs is already in the RB-tree, an error message is + * printed, the new entry is discarded and %0 is still returned. + */ +static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) +{ + struct ubifs_old_idx *old_idx, *o; + struct rb_node **p, *parent = NULL; + + old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS); + if (unlikely(!old_idx)) + return -ENOMEM; + old_idx->lnum = lnum; + old_idx->offs = offs; + + p = &c->old_idx.rb_node; + while (*p) { + parent = *p; + o = rb_entry(parent, struct ubifs_old_idx, rb); + if (lnum < o->lnum) + p = &(*p)->rb_left; + else if (lnum > o->lnum) + p = &(*p)->rb_right; + else if (offs < o->offs) + p = &(*p)->rb_left; + else if (offs > o->offs) + p = &(*p)->rb_right; + else { + ubifs_err("old idx added twice!"); + kfree(old_idx); + return 0; + } + } + rb_link_node(&old_idx->rb, parent, p); + rb_insert_color(&old_idx->rb, &c->old_idx); + return 0; +} + +/** + * insert_old_idx_znode - record a znode obsoleted since last commit start.
+ * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * The position to record is taken from the parent's zbranch for @znode, or + * from @c->zroot if @znode has no parent. Nothing is recorded if the length + * stored in that branch is zero. + * + * Returns %0 on success, and a negative error code on failure. + */ +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode) +{ + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) + return insert_old_idx(c, zbr->lnum, zbr->offs); + } else + if (c->zroot.len) + return insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + return 0; +} + +/** + * ins_clr_old_idx_znode - record a znode obsoleted since last commit start. + * @c: UBIFS file-system description object + * @znode: znode of obsoleted index node + * + * This function does the same as 'insert_old_idx_znode()' and, once the + * position has been recorded, resets the lnum, offs and len fields of the + * branch to zero. + * + * Returns %0 on success, and a negative error code on failure. + */ +static int ins_clr_old_idx_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int err; + + if (znode->parent) { + struct ubifs_zbranch *zbr; + + zbr = &znode->parent->zbranch[znode->iip]; + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (err) + return err; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + } + } else + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, c->zroot.offs); + if (err) + return err; + c->zroot.lnum = 0; + c->zroot.offs = 0; + c->zroot.len = 0; + } + return 0; +} + +/** + * destroy_old_idx - destroy the old_idx RB-tree. + * @c: UBIFS file-system description object + * + * During start commit, the old_idx RB-tree is used to avoid overwriting index + * nodes that were in the index last commit but have since been deleted. This + * is necessary for recovery i.e. the old index must be kept intact until the + * new index is successfully written. The old-idx RB-tree is used for the + * in-the-gaps method of writing index nodes and is destroyed every commit.
+ */ +void destroy_old_idx(struct ubifs_info *c) +{ + struct rb_node *this = c->old_idx.rb_node; + struct ubifs_old_idx *old_idx; + + while (this) { + if (this->rb_left) { + this = this->rb_left; + continue; + } else if (this->rb_right) { + this = this->rb_right; + continue; + } + old_idx = rb_entry(this, struct ubifs_old_idx, rb); + this = rb_parent(this); + if (this) { + if (this->rb_left == &old_idx->rb) + this->rb_left = NULL; + else + this->rb_right = NULL; + } + kfree(old_idx); + } + c->old_idx = RB_ROOT; +} + +/** + * copy_znode - copy a dirty znode. + * @c: UBIFS file-system description object + * @znode: znode to copy + * + * A dirty znode being committed may not be changed, so it is copied. The copy + * is marked dirty and not copy-on-write, @znode is marked obsolete, and the + * children which have a znode attached get the copy as their parent. + * + * Returns the copy, or ERR_PTR(-ENOMEM) if it could not be allocated. + */ +static struct ubifs_znode *copy_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + zn = kmalloc(c->max_znode_sz, GFP_NOFS); + if (unlikely(!zn)) + return ERR_PTR(-ENOMEM); + + memcpy(zn, znode, c->max_znode_sz); + zn->cnext = NULL; + __set_bit(DIRTY_ZNODE, &zn->flags); + __clear_bit(COW_ZNODE, &zn->flags); + + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + __set_bit(OBSOLETE_ZNODE, &znode->flags); + + if (znode->level != 0) { + int i; + const int n = zn->child_cnt; + + /* The children now have the copy as their new parent */ + for (i = 0; i < n; i++) { + struct ubifs_zbranch *zbr = &zn->zbranch[i]; + + if (zbr->znode) + zbr->znode->parent = zn; + } + } + + atomic_long_inc(&c->dirty_zn_cnt); + return zn; +} + +/** + * add_idx_dirt - add dirt due to a dirty znode. + * @c: UBIFS file-system description object + * @lnum: LEB number of index node + * @dirt: size of index node + * + * This function updates lprops dirty space and the new size of the index: + * @c->calc_idx_sz is decreased by @dirt aligned to 8 bytes. Returns whatever + * 'ubifs_add_dirt()' returns. + */ +static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt) +{ + c->calc_idx_sz -= ALIGN(dirt, 8); + return ubifs_add_dirt(c, lnum, dirt); +} + +/** + * dirty_cow_znode - ensure a znode is not being committed.
+ * @c: UBIFS file-system description object + * @zbr: branch of znode to check + * + * If the znode is not being committed (%COW_ZNODE is not set), it is marked + * dirty in place. Otherwise it is copied by 'copy_znode()', the old position + * is recorded by 'insert_old_idx()' and @zbr is re-pointed to the copy with + * its lnum, offs and len fields zeroed. + * + * Returns dirtied znode on success or an ERR_PTR-encoded negative error code + * on failure. + * + * NOTE(review): if 'insert_old_idx()' fails, the copy made by 'copy_znode()' + * is not freed before returning - confirm whether this leak is acceptable. + */ +static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr) +{ + struct ubifs_znode *znode = zbr->znode; + struct ubifs_znode *zn; + int err; + + if (!test_bit(COW_ZNODE, &znode->flags)) { + /* znode is not being committed */ + if (!test_and_set_bit(DIRTY_ZNODE, &znode->flags)) { + atomic_long_inc(&c->dirty_zn_cnt); + atomic_long_dec(&c->clean_zn_cnt); + atomic_long_dec(&ubifs_clean_zn_cnt); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + if (unlikely(err)) + return ERR_PTR(err); + } + return znode; + } + + zn = copy_znode(c, znode); + if (IS_ERR(zn)) + return zn; + + if (zbr->len) { + err = insert_old_idx(c, zbr->lnum, zbr->offs); + if (unlikely(err)) + return ERR_PTR(err); + err = add_idx_dirt(c, zbr->lnum, zbr->len); + } else + err = 0; + + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + + if (unlikely(err)) + return ERR_PTR(err); + return zn; +} + +/** + * lnc_add - add a leaf node to the leaf node cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * Leaf nodes are non-index nodes: directory entry nodes or data nodes. The + * purpose of the leaf node cache is to save re-reading the same leaf node over + * and over again. Most things are cached by VFS, however the file system must + * cache directory entries for readdir and for resolving hash collisions. The + * present implementation of the leaf node cache is extremely simple, and + * allows for error returns that are not used but that may be needed if a more + * complex implementation is created. + * + * Note, this function does not add the @node object to LNC directly, but + * allocates a copy of the object and adds the copy to LNC.
The reason for this + * is that @node has been allocated outside of the TNC subsystem and will be + * used with @c->tnc_mutex unlocked upon return from the TNC subsystem. But LNC + * may be changed at any time, e.g. freed by the shrinker. + * + * Returns %0 on success, including the case when the copy could not be + * allocated, and the error returned by 'ubifs_validate_entry()' if @node is + * not valid. + */ +static int lnc_add(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const void *node) +{ + int err; + void *lnc_node; + const struct ubifs_dent_node *dent = node; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + ubifs_assert(is_hash_key(c, &zbr->key)); + + err = ubifs_validate_entry(c, dent); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, dent); + return err; + } + + lnc_node = kmalloc(zbr->len, GFP_NOFS); + if (!lnc_node) + /* We don't have to have the cache, so no error */ + return 0; + + memcpy(lnc_node, node, zbr->len); + zbr->leaf = lnc_node; + return 0; +} + + /** + * lnc_add_directly - add a leaf node to the leaf-node-cache. + * @c: UBIFS file-system description object + * @zbr: zbranch of leaf node + * @node: leaf node + * + * This function is similar to 'lnc_add()', but it does not create a copy of + * @node but inserts @node to LNC directly. If 'ubifs_validate_entry()' fails, + * its error is returned and @node is not inserted. + */ +static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(!zbr->leaf); + ubifs_assert(zbr->len != 0); + + err = ubifs_validate_entry(c, node); + if (err) { + dbg_dump_stack(); + dbg_dump_node(c, node); + return err; + } + + zbr->leaf = node; + return 0; +} + +/** + * lnc_free - remove a leaf node from the leaf node cache. + * @zbr: zbranch of leaf node + * + * Frees the cached leaf node of @zbr, if there is one, and resets @zbr->leaf. + */ +static void lnc_free(struct ubifs_zbranch *zbr) +{ + if (!zbr->leaf) + return; + kfree(zbr->leaf); + zbr->leaf = NULL; +} + +/** + * tnc_read_node_nm - read a "hashed" leaf node.
+ * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a "hashed" node defined by @zbr from the leaf node cache + * (if it is there) or from the flash media, in which case the node is also + * added to LNC. Returns zero in case of success or a negative error + * code in case of failure. + */ +static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + int err; + + ubifs_assert(is_hash_key(c, &zbr->key)); + + if (zbr->leaf) { + /* Read from the leaf node cache */ + ubifs_assert(zbr->len != 0); + memcpy(node, zbr->leaf, zbr->len); + return 0; + } + + err = ubifs_tnc_read_node(c, zbr, node); + if (err) + return err; + + /* Add the node to the leaf node cache */ + err = lnc_add(c, zbr, node); + return err; +} + +/** + * try_read_node - read a node if it is a node. + * @c: UBIFS file-system description object + * @buf: buffer to read to + * @type: node type + * @len: node length (not aligned) + * @lnum: LEB number of node to read + * @offs: offset of node to read + * + * This function tries to read a node of known type and length, checks it and + * stores it in @buf. This function returns %1 if a node is present and %0 if + * a node is not present. A negative error code is returned for I/O errors. + * This function performs the same function as 'ubifs_read_node()' except that + * it does not require that there is actually a node present and instead + * the return code indicates if a node was read. + * + * Note, this function does not check CRC of data nodes if @c->no_chk_data_crc + * is true (it is controlled by corresponding mount option). However, if + * @c->always_chk_crc is true, @c->no_chk_data_crc is ignored and CRC is always + * checked.
+ */ +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs) +{ + int err, node_len; + struct ubifs_ch *ch = buf; + uint32_t crc, node_crc; + + dbg_io("LEB %d:%d, %s, length %d", lnum, offs, dbg_ntype(type), len); + + err = ubi_read(c->ubi, lnum, buf, offs, len); + if (err) { + ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + type, lnum, offs, err); + return err; + } + + if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) + return 0; + + if (ch->node_type != type) + return 0; + + node_len = le32_to_cpu(ch->len); + if (node_len != len) + return 0; + + if (type == UBIFS_DATA_NODE && !c->always_chk_crc && c->no_chk_data_crc) + return 1; + + crc = crc32(UBIFS_CRC32_INIT, buf + 8, node_len - 8); + node_crc = le32_to_cpu(ch->crc); + if (crc != node_crc) + return 0; + + return 1; +} + +/** + * fallible_read_node - try to read a leaf node. + * @c: UBIFS file-system description object + * @key: key of node to read + * @zbr: position of node + * @node: node returned + * + * This function tries to read a node and returns %1 if the node is read, %0 + * if the node is not present, and a negative error code in the case of error. + * A node whose key does not match @key is treated as not present. + */ +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node) +{ + int ret; + + dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key)); + + ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum, + zbr->offs); + if (ret == 1) { + union ubifs_key node_key; + struct ubifs_dent_node *dent = node; + + /* All nodes have key in the same place */ + key_read(c, &dent->key, &node_key); + if (keys_cmp(c, key, &node_key) != 0) + ret = 0; + } + if (ret == 0 && c->replaying) + dbg_mnt("dangling branch LEB %d:%d len %d, key %s", + zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); + return ret; +} + +/** + * matches_name - determine if a direntry or xattr entry matches a given name.
+ * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by + * @zbr is less than @nm, and %NAME_GREATER if it is greater than @nm. In case + * of failure, a negative error code is returned. + * + * If the entry is not in the leaf node cache yet, it is read from the media + * and added to the cache. Names are compared byte-wise over their common + * length; if that part is equal, the shorter name is the lesser one. + */ +static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = ubifs_tnc_read_node(c, zbr, dent); + if (err) + goto out_free; + + /* Add the node to the leaf node cache */ + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * get_znode - get a TNC znode that may not be loaded yet. + * @c: UBIFS file-system description object + * @znode: parent znode + * @n: znode branch slot number + * + * This function returns the znode or a negative error code. If the znode in + * slot @n is not in memory, it is loaded by 'ubifs_load_znode()'. + */ +static struct ubifs_znode *get_znode(struct ubifs_info *c, + struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + + zbr = &znode->zbranch[n]; + if (zbr->znode) + znode = zbr->znode; + else + znode = ubifs_load_znode(c, zbr, znode, n); + return znode; +} + +/** + * tnc_next - find next TNC entry.
+ * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the next TNC entry is found, %-ENOENT if there is + * no next entry, or a negative error code otherwise. @zn and @n are changed + * only if %0 is returned. + */ +static int tnc_next(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + nn += 1; + if (nn < znode->child_cnt) { + *n = nn; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip + 1; + znode = zp; + if (nn < znode->child_cnt) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = 0; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * tnc_prev - find previous TNC entry. + * @c: UBIFS file-system description object + * @zn: znode is passed and returned here + * @n: znode branch slot number is passed and returned here + * + * This function returns %0 if the previous TNC entry is found, %-ENOENT if + * there is no previous entry, or a negative error code otherwise. @zn and @n + * are changed only if %0 is returned. + */ +static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) +{ + struct ubifs_znode *znode = *zn; + int nn = *n; + + if (nn > 0) { + *n = nn - 1; + return 0; + } + while (1) { + struct ubifs_znode *zp; + + zp = znode->parent; + if (!zp) + return -ENOENT; + nn = znode->iip - 1; + znode = zp; + if (nn >= 0) { + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + while (znode->level != 0) { + nn = znode->child_cnt - 1; + znode = get_znode(c, znode, nn); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + nn = znode->child_cnt - 1; + break; + } + } + *zn = znode; + *n = nn; + return 0; +} + +/** + * resolve_collision - resolve a collision.
+ * @c: UBIFS file-system description object + * @key: key of a directory or extended attribute entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @nm: name of the entry + * + * This function is called for "hashed" keys to make sure that the found key + * really corresponds to the looked up node (directory or extended attribute + * entry). It returns %1 and sets @zn and @n if the collision is resolved. + * %0 is returned if @nm is not found and @zn and @n are set to the previous + * entry, i.e. to the entry after which @nm could follow if it were in TNC. + * This means that @n may be set to %-1 if the leftmost key in @zn is the + * previous one. A negative error code is returned on failures. + */ +static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm) +{ + int err; + + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (unlikely(err < 0)) + return err; + if (err == NAME_MATCHES) + return 1; + + if (err == NAME_GREATER) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + return 0; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* + * We have found the branch after which we would + * like to insert, but inserting in this znode + * may still be wrong. Consider the following 3 + * znodes, in the case where we are resolving a + * collision with Key2. + * + * znode zp + * ---------------------- + * level 1 | Key0 | Key1 | + * ----------------------- + * | | + * znode za | | znode zb + * ------------ ------------ + * level 0 | Key0 | | Key2 | + * ------------ ------------ + * + * The lookup finds Key2 in znode zb. Let's say + * there is no match and the name is greater so + * we look left. When we find Key0, we end up + * here. If we return now, we will insert into + * znode za at slot n = 1.
But that is invalid + * according to the parent's keys. Key2 must + * be inserted into znode zb. + * + * Note, this problem is not relevant for the + * case when we go right, because + * 'tnc_insert()' would correct the parent key. + */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + return 0; + } + err = matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_LESS) + return 0; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_GREATER); + } + } else { + int nn = *n; + struct ubifs_znode *znode = *zn; + + /* Look right */ + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + err = matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + return 0; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + ubifs_assert(err == NAME_LESS); + } + } +} + +/** + * fallible_matches_name - determine if a dent matches a given name. + * @c: UBIFS file-system description object + * @zbr: zbranch of dent + * @nm: name to match + * + * This is a "fallible" version of 'matches_name()' function which does not + * panic if the direntry/xentry referred by @zbr does not exist on the media. + * + * This function checks if xentry/direntry referred by zbranch @zbr matches name + * @nm. Returns %NAME_MATCHES if it does, %NAME_LESS if the name referred by @zbr + * is less than @nm, %NAME_GREATER if it is greater than @nm, and %NOT_ON_MEDIA + * if xentry/direntry referred by @zbr does not exist on the media. A negative + * error code is returned in case of failure.
+ */ +static int fallible_matches_name(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + const struct qstr *nm) +{ + struct ubifs_dent_node *dent; + int nlen, err; + + /* If possible, match against the dent in the leaf node cache */ + if (!zbr->leaf) { + dent = kmalloc(zbr->len, GFP_NOFS); + if (!dent) + return -ENOMEM; + + err = fallible_read_node(c, &zbr->key, zbr, dent); + if (err < 0) + goto out_free; + if (err == 0) { + /* The node was not present, report it to the caller */ + err = NOT_ON_MEDIA; + goto out_free; + } + ubifs_assert(err == 1); + + err = lnc_add_directly(c, zbr, dent); + if (err) + goto out_free; + } else + dent = zbr->leaf; + + nlen = le16_to_cpu(dent->nlen); + err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + if (err == 0) { + if (nlen == nm->len) + return NAME_MATCHES; + else if (nlen < nm->len) + return NAME_LESS; + else + return NAME_GREATER; + } else if (err < 0) + return NAME_LESS; + else + return NAME_GREATER; + +out_free: + kfree(dent); + return err; +} + +/** + * fallible_resolve_collision - resolve a collision even if nodes are missing. + * @c: UBIFS file-system description object + * @key: key of the entry being looked up + * @zn: znode is passed and returned here + * @n: branch number is passed and returned here + * @nm: name of directory entry + * @adding: indicates caller is adding a key to the TNC + * + * This is a "fallible" version of the 'resolve_collision()' function which + * does not panic if one of the nodes referred to by TNC does not exist on the + * media. This may happen when replaying the journal if a deleted node was + * garbage-collected and the commit was not done. A branch that refers to a node + * that is not present is called a dangling branch.
The following are the return + * codes for this function: + * o if @nm was found, %1 is returned and @zn and @n are set to the found + * branch; + * o if we are @adding and @nm was not found, %0 is returned; + * o if we are not @adding and @nm was not found, but a dangling branch was + * found, then %1 is returned and @zn and @n are set to the dangling branch; + * o if we are not @adding and neither @nm nor a dangling branch was found, + * %0 is returned; + * o a negative error code is returned in case of failure. + */ +static int fallible_resolve_collision(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + const struct qstr *nm, int adding) +{ + struct ubifs_znode *o_znode = NULL, *znode = *zn; + int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; + + cmp = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (unlikely(cmp < 0)) + return cmp; + if (cmp == NAME_MATCHES) + return 1; + if (cmp == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + /* + * We are unlucky and hit a dangling branch straight away. + * Now we do not really know where to go to find the needed + * branch - to the left or to the right. Well, let's try left.
+ */ + unsure = 1; + } else if (!adding) + unsure = 1; /* Remove a dangling branch wherever it is */ + + if (cmp == NAME_GREATER || unsure) { + /* Look left */ + while (1) { + err = tnc_prev(c, zn, n); + if (err == -ENOENT) { + ubifs_assert(*n == 0); + *n = -1; + break; + } + if (err < 0) + return err; + if (keys_cmp(c, &(*zn)->zbranch[*n].key, key)) { + /* See comments in 'resolve_collision()' */ + if (*n == (*zn)->child_cnt - 1) { + err = tnc_next(c, zn, n); + if (err) { + /* Should be impossible */ + ubifs_assert(0); + if (err == -ENOENT) + err = -EINVAL; + return err; + } + ubifs_assert(*n == 0); + *n = -1; + } + break; + } + err = fallible_matches_name(c, &(*zn)->zbranch[*n], nm); + if (err < 0) + return err; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = *zn; + o_n = *n; + continue; + } + if (!adding) + continue; + if (err == NAME_LESS) + break; + else + unsure = 0; + } + } + + if (cmp == NAME_LESS || unsure) { + /* Look right, starting again from the original position */ + *zn = znode; + *n = nn; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + err = fallible_matches_name(c, &znode->zbranch[nn], nm); + if (err < 0) + return err; + if (err == NAME_GREATER) + break; + *zn = znode; + *n = nn; + if (err == NAME_MATCHES) + return 1; + if (err == NOT_ON_MEDIA) { + o_znode = znode; + o_n = nn; + } + } + } + + /* Never match a dangling branch when adding */ + if (adding || !o_znode) + return 0; + + dbg_mnt("dangling match LEB %d:%d len %d %s", + o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs, + o_znode->zbranch[o_n].len, DBGKEY(key)); + *zn = o_znode; + *n = o_n; + return 1; +} + +/** + * matches_position - determine if a zbranch matches a given position. + * @zbr: zbranch of dent + * @lnum: LEB number of dent to match + * @offs: offset of dent to match + * + * This function returns %1 if @lnum:@offs matches, and %0 otherwise.
+ */ +static int matches_position(struct ubifs_zbranch *zbr, int lnum, int offs) +{ + if (zbr->lnum == lnum && zbr->offs == offs) + return 1; + else + return 0; +} + +/** + * resolve_collision_directly - resolve a collision directly. + * @c: UBIFS file-system description object + * @key: key of directory entry + * @zn: znode is passed and returned here + * @n: zbranch number is passed and returned here + * @lnum: LEB number of dent node to match + * @offs: offset of dent node to match + * + * This function is used for "hashed" keys to make sure the found directory or + * extended attribute entry node is what was looked for. It is used when the + * flash address of the right node is known (@lnum:@offs) which makes it much + * easier to resolve collisions (no need to read entries and match full + * names). This function returns %1 and sets @zn and @n if the collision is + * resolved, %0 if @lnum:@offs is not found and @zn and @n are set to the + * previous directory entry. Otherwise a negative error code is returned. 
+ */ +static int resolve_collision_directly(struct ubifs_info *c, + const union ubifs_key *key, + struct ubifs_znode **zn, int *n, + int lnum, int offs) +{ + struct ubifs_znode *znode; + int nn, err; + + znode = *zn; + nn = *n; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + + /* Look left while the key stays the same */ + while (1) { + err = tnc_prev(c, &znode, &nn); + if (err == -ENOENT) + break; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + break; + if (matches_position(&znode->zbranch[nn], lnum, offs)) { + *zn = znode; + *n = nn; + return 1; + } + } + + /* Look right, starting again from the position passed by the caller */ + znode = *zn; + nn = *n; + while (1) { + err = tnc_next(c, &znode, &nn); + if (err == -ENOENT) + return 0; + if (err < 0) + return err; + if (keys_cmp(c, &znode->zbranch[nn].key, key)) + return 0; + *zn = znode; + *n = nn; + if (matches_position(&znode->zbranch[nn], lnum, offs)) + return 1; + } +} + +/** + * dirty_cow_bottom_up - dirty a znode and its ancestors. + * @c: UBIFS file-system description object + * @znode: znode to dirty + * + * If we do not have a unique key that resides in a znode, then we cannot + * dirty that znode from the top down (i.e. by using 'lookup_level0_dirty()'). + * This function records the path back to the last dirty ancestor, and then + * dirties the znodes on that path. Returns the dirtied znode on success or an + * ERR_PTR-encoded negative error code on failure.
+ */ +static struct ubifs_znode *dirty_cow_bottom_up(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + struct ubifs_znode *zp; + int *path = c->bottom_up_buf, p = 0; + + ubifs_assert(c->zroot.znode); + ubifs_assert(znode); + if (c->zroot.znode->level > BOTTOM_UP_HEIGHT) { + kfree(c->bottom_up_buf); + c->bottom_up_buf = kmalloc(c->zroot.znode->level * sizeof(int), + GFP_NOFS); + if (!c->bottom_up_buf) + return ERR_PTR(-ENOMEM); + path = c->bottom_up_buf; + } + if (c->zroot.znode->level) { + /* Go up until parent is dirty, recording the slot at each level */ + while (1) { + int n; + + zp = znode->parent; + if (!zp) + break; + n = znode->iip; + ubifs_assert(p < c->zroot.znode->level); + path[p++] = n; + if (!zp->cnext && ubifs_zn_dirty(znode)) + break; + znode = zp; + } + } + + /* Come back down the recorded path, dirtying as we go */ + while (1) { + struct ubifs_zbranch *zbr; + + zp = znode->parent; + if (zp) { + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < zp->child_cnt); + zbr = &zp->zbranch[path[--p]]; + znode = dirty_cow_znode(c, zbr); + } else { + ubifs_assert(znode == c->zroot.znode); + znode = dirty_cow_znode(c, &c->zroot); + } + if (IS_ERR(znode) || !p) + break; + ubifs_assert(path[p - 1] >= 0); + ubifs_assert(path[p - 1] < znode->child_cnt); + znode = znode->zbranch[path[p - 1]].znode; + } + + return znode; +} + +/** + * ubifs_lookup_level0 - search for zero-level znode. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and searches for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e.
the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain + * @key, then %0 is returned and slot number of the closest branch is stored + * in @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned. + */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = zbr->znode; + continue; + } + + /* znode is not in TNC cache, load it from the media */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * Here is a tricky place. We have not found the key and this is a + * "hashed" key, which may collide.
The rest of the code deals with + * situations like this: + * + * | 3 | 5 | + * / \ + * | 3 | 5 | | 6 | 7 | (x) + * + * Or a more complex example: + * + * | 1 | 5 | + * / \ + * | 1 | 3 | | 5 | 8 | + * \ / + * | 5 | 5 | | 6 | 7 | (x) + * + * In the examples, if we are looking for key "5", we may reach nodes + * marked with "(x)". In this case what we have to do is to look at the + * left and see if there is "5" key there. If there is, we have to + * return it. + * + * Note, this whole situation is possible because we allow to have + * elements which are equivalent to the next key in the parent in the + * children of current znode. For example, this happens if we split a + * znode like this: | 3 | 5 | 5 | 6 | 7 |, which results in something + * like this: + * | 3 | 5 | + * / \ + * | 3 | 5 | | 5 | 6 | 7 | + * ^ + * And this becomes what is at the first "picture" after key "5" marked + * with "^" is removed. What could be done is we could prohibit + * splitting in the middle of the colliding sequence. Also, when + * removing the leftmost key, we would have to correct the key of the + * parent node, which would introduce additional complications. Namely, + * if we changed the leftmost key of the parent znode, the garbage + * collector would be unable to find it (GC is doing this when GC'ing + * indexing LEBs). Although we already have an additional RB-tree where + * we save such changed znodes (see 'ins_clr_old_idx_znode()') until + * after the commit. But anyway, this does not look easy to implement + * so we did not try this.
+ */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + dbg_tnc("found 0, lvl %d, n -1", znode->level); + *n = -1; + return 0; + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * lookup_level0_dirty - search for zero-level znode dirtying. + * @c: UBIFS file-system description object + * @key: key to lookup + * @zn: znode is returned here + * @n: znode branch slot number is returned here + * + * This function looks up the TNC tree and searches for zero-level znode which + * refers key @key. The found zero-level znode is returned in @zn. There are 3 + * cases: + * o exact match, i.e. the found zero-level znode contains key @key, then %1 + * is returned and slot number of the matched branch is stored in @n; + * o not exact match, which means that zero-level znode does not contain @key, + * then %0 is returned and slot number of the closest branch is stored in + * @n; + * o @key is so small that it is even less than the lowest key of the + * leftmost zero-level node, then %0 is returned and %-1 is stored in @n. + * + * Additionally all znodes in the path from the root to the located zero-level + * znode are marked as dirty. + * + * Note, when the TNC tree is traversed, some znodes may be absent, then this + * function reads corresponding indexing nodes and inserts them to TNC. In + * case of failure, a negative error code is returned.
+ */ +static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n) +{ + int err, exact; + struct ubifs_znode *znode; + unsigned long time = get_seconds(); + + dbg_tnc("search and dirty key %s", DBGKEY(key)); + + znode = c->zroot.znode; + if (unlikely(!znode)) { + znode = ubifs_load_znode(c, &c->zroot, NULL, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + znode = dirty_cow_znode(c, &c->zroot); + if (IS_ERR(znode)) + return PTR_ERR(znode); + + znode->time = time; + + while (1) { + struct ubifs_zbranch *zbr; + + exact = ubifs_search_zbranch(c, znode, key, n); + + if (znode->level == 0) + break; + + if (*n < 0) + *n = 0; + zbr = &znode->zbranch[*n]; + + if (zbr->znode) { + znode->time = time; + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + continue; + } + + /* znode is not in TNC cache, load it from the media and dirty it */ + znode = ubifs_load_znode(c, zbr, znode, *n); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + *zn = znode; + if (exact || !is_hash_key(c, key) || *n != -1) { + dbg_tnc("found %d, lvl %d, n %d", exact, znode->level, *n); + return exact; + } + + /* + * See the huge comment in 'ubifs_lookup_level0()' for an explanation of + * the rest of the code. The only difference is that the znode found by + * 'tnc_prev()' is dirtied before it is returned. 
+ */ + err = tnc_prev(c, &znode, n); + if (err == -ENOENT) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + if (unlikely(err < 0)) + return err; + if (keys_cmp(c, key, &znode->zbranch[*n].key)) { + *n = -1; + dbg_tnc("found 0, lvl %d, n -1", znode->level); + return 0; + } + + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) + return PTR_ERR(znode); + } + + dbg_tnc("found 1, lvl %d, n %d", znode->level, *n); + *zn = znode; + return 1; +} + +/** + * maybe_leb_gced - determine if a LEB may have been garbage collected. 
+ * @c: UBIFS file-system description object + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number + * + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. + */ +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) +{ + int gc_seq2, gced_lnum; + + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same sequence number means no GC has happened since @gc_seq1 */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know, so assume the worst */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check whether the GC'ed LEB is @lnum */ + if (gced_lnum == lnum) + return 1; + return 0; +} + +/** + * ubifs_tnc_locate - look up a file-system node and return it and its location. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @lnum: LEB number is returned here + * @offs: offset is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. 
+ */ +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs) +{ + int found, n, err, safely = 0, gc_seq1; + struct ubifs_znode *znode; + struct ubifs_zbranch zbr, *zt; + +again: + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out; + } else if (found < 0) { + err = found; + goto out; + } + zt = &znode->zbranch[n]; + if (lnum) { /* @lnum may be NULL; @offs is written whenever @lnum is not */ + *lnum = zt->lnum; + *offs = zt->offs; + } + if (is_hash_key(c, key)) { + /* + * In this case the leaf node cache gets used, so we pass the + * address of the zbranch and keep the mutex locked. + */ + err = tnc_read_node_nm(c, zt, node); + goto out; + } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* + * Drop the TNC mutex prematurely and race with garbage collection. A + * copy of the zbranch and the current GC sequence number are taken + * first, so that the read does not depend on the TNC and the race can + * be detected afterwards. + */ + zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads, so the result is returned as is */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } + + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; + +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_get_bu_keys - lookup keys for bulk-read. 
+ * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * Lookup consecutive data node keys for the same inode that reside + * consecutively in the same LEB. This function returns zero in case of success + * and a negative error code in case of failure. + * + * Note, if the bulk-read buffer length (@bu->buf_len) is known, this function + * makes sure bulk-read nodes fit the buffer. Otherwise, this function prepares + * maximum possible amount of nodes for bulk-read. 
+ */ +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu) +{ + int n, err = 0, lnum = -1, uninitialized_var(offs); + int uninitialized_var(len); + unsigned int block = key_block(c, &bu->key); + struct ubifs_znode *znode; + + bu->cnt = 0; + bu->blk_cnt = 0; + bu->eof = 0; + + mutex_lock(&c->tnc_mutex); + /* Find first key */ + err = ubifs_lookup_level0(c, &bu->key, &znode, &n); + if (err < 0) + goto out; + if (err) { + /* Key found, it becomes the first bulk-read node */ + len = znode->zbranch[n].len; + /* The buffer must be big enough for at least 1 node */ + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + /* Add this key */ + bu->zbranch[bu->cnt++] = znode->zbranch[n]; + bu->blk_cnt += 1; + lnum = znode->zbranch[n].lnum; + offs = ALIGN(znode->zbranch[n].offs + len, 8); + } + while (1) { + struct ubifs_zbranch *zbr; + union ubifs_key *key; + unsigned int next_block; + + /* Find next key */ + err = tnc_next(c, &znode, &n); + if (err) + goto out; + zbr = &znode->zbranch[n]; + key = &zbr->key; + /* See if there is another data key for this file */ + if (key_inum(c, key) != key_inum(c, &bu->key) || + key_type(c, key) != UBIFS_DATA_KEY) { + err = -ENOENT; + goto out; + } + if (lnum < 0) { + /* First key found (the lookup above was not an exact match) */ + lnum = zbr->lnum; + offs = ALIGN(zbr->offs + zbr->len, 8); + len = zbr->len; + if (len > bu->buf_len) { + err = -EINVAL; + goto out; + } + } else { + /* + * The data nodes must be in consecutive positions in + * the same LEB. 
+ */ + if (zbr->lnum != lnum || zbr->offs != offs) + goto out; + offs += ALIGN(zbr->len, 8); + len = ALIGN(len, 8) + zbr->len; + /* Must not exceed buffer length */ + if (len > bu->buf_len) + goto out; + } + /* Allow for holes: blocks skipped between two keys are counted too */ + next_block = key_block(c, key); + bu->blk_cnt += (next_block - block - 1); + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + block = next_block; + /* Add this key */ + bu->zbranch[bu->cnt++] = *zbr; + bu->blk_cnt += 1; + /* See if we have room for more */ + if (bu->cnt >= UBIFS_MAX_BULK_READ) + goto out; + if (bu->blk_cnt >= UBIFS_MAX_BULK_READ) + goto out; + } +out: + if (err == -ENOENT) { + bu->eof = 1; + err = 0; + } + bu->gc_seq = c->gc_seq; + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + /* + * An enormous hole could cause bulk-read to encompass too many + * page cache pages, so limit the number here. + */ + if (bu->blk_cnt > UBIFS_MAX_BULK_READ) + bu->blk_cnt = UBIFS_MAX_BULK_READ; + /* + * Ensure that bulk-read covers a whole number of page cache + * pages. + */ + if (UBIFS_BLOCKS_PER_PAGE == 1 || + !(bu->blk_cnt & (UBIFS_BLOCKS_PER_PAGE - 1))) + return 0; + if (bu->eof) { + /* At the end of file we can round up */ + bu->blk_cnt += UBIFS_BLOCKS_PER_PAGE - 1; + return 0; + } + /* Exclude data nodes that do not make up a whole page cache page */ + block = key_block(c, &bu->key) + bu->blk_cnt; + block &= ~(UBIFS_BLOCKS_PER_PAGE - 1); + while (bu->cnt) { + if (key_block(c, &bu->zbranch[bu->cnt - 1].key) < block) + break; + bu->cnt -= 1; + } + return 0; +} + +/** + * read_wbuf - bulk-read from a LEB with a wbuf. + * @wbuf: wbuf that may overlap the read + * @buf: buffer into which to read + * @len: read length + * @lnum: LEB number from which to read + * @offs: offset from which to read + * + * This functions returns %0 on success or a negative error code on failure. 
+ */ +static int read_wbuf(struct ubifs_wbuf *wbuf, void *buf, int len, int lnum, + int offs) +{ + const struct ubifs_info *c = wbuf->c; /* NOTE(review): dereferenced before the 'wbuf' assertion below */ + int rlen, overlap; + + dbg_io("LEB %d:%d, length %d", lnum, offs, len); + ubifs_assert(wbuf && lnum >= 0 && lnum < c->leb_cnt && offs >= 0); + ubifs_assert(!(offs & 7) && offs < c->leb_size); + ubifs_assert(offs + len <= c->leb_size); + + spin_lock(&wbuf->lock); + overlap = (lnum == wbuf->lnum && offs + len > wbuf->offs); + if (!overlap) { + /* No overlap: we may safely unlock the write-buffer and read the data */ + spin_unlock(&wbuf->lock); + return ubi_read(c->ubi, lnum, buf, offs, len); + } + + /* Don't read under wbuf: @rlen is the length that precedes it */ + rlen = wbuf->offs - offs; + if (rlen < 0) + rlen = 0; + + /* Copy the rest from the write-buffer */ + memcpy(buf + rlen, wbuf->buf + offs + rlen - wbuf->offs, len - rlen); + spin_unlock(&wbuf->lock); + + if (rlen > 0) + /* Read everything that goes before write-buffer */ + return ubi_read(c->ubi, lnum, buf, offs, rlen); + + return 0; +} + +/** + * validate_data_node - validate data nodes for bulk-read. + * @c: UBIFS file-system description object + * @buf: buffer containing data node to validate + * @zbr: zbranch of data node to validate + * + * This functions returns %0 on success or a negative error code on failure. 
+ */ +static int validate_data_node(struct ubifs_info *c, void *buf, + struct ubifs_zbranch *zbr) +{ + union ubifs_key key1; + struct ubifs_ch *ch = buf; + int err, len; + + if (ch->node_type != UBIFS_DATA_NODE) { + ubifs_err("bad node type (%d but expected %d)", + ch->node_type, UBIFS_DATA_NODE); + goto out_err; + } + + err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); + if (err) { + ubifs_err("expected node type %d", UBIFS_DATA_NODE); + goto out; /* keep the error code of 'ubifs_check_node()' */ + } + + len = le32_to_cpu(ch->len); + if (len != zbr->len) { + ubifs_err("bad node length %d, expected %d", len, zbr->len); + goto out_err; + } + + /* Make sure the key of the read node matches the key in the zbranch */ + key_read(c, buf + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, &zbr->key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(&zbr->key), DBGKEY1(&key1)); + goto out_err; + } + + return 0; + +out_err: + err = -EINVAL; /* mismatches detected by this function itself */ +out: + ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + dbg_dump_node(c, buf); + dbg_dump_stack(); + return err; +} + +/** + * ubifs_tnc_bulk_read - read a number of data nodes in one go. + * @c: UBIFS file-system description object + * @bu: bulk-read parameters and results + * + * This functions reads and validates the data nodes that were identified by the + * 'ubifs_tnc_get_bu_keys()' function. This functions returns %0 on success, + * -EAGAIN to indicate a race with GC, or another negative error code on + * failure. 
+ */ +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) +{ + int lnum = bu->zbranch[0].lnum, offs = bu->zbranch[0].offs, len, err, i; + struct ubifs_wbuf *wbuf; + void *buf; + + len = bu->zbranch[bu->cnt - 1].offs; + len += bu->zbranch[bu->cnt - 1].len - offs; + if (len > bu->buf_len) { + ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + return -EINVAL; + } + + /* Do the read, through the write-buffer if this LEB has one */ + wbuf = ubifs_get_wbuf(c, lnum); + if (wbuf) + err = read_wbuf(wbuf, bu->buf, len, lnum, offs); + else + err = ubi_read(c->ubi, lnum, bu->buf, offs, len); + + /* Check for a race with GC, this takes precedence over read errors */ + if (maybe_leb_gced(c, lnum, bu->gc_seq)) + return -EAGAIN; + + if (err && err != -EBADMSG) { + ubifs_err("failed to read from LEB %d:%d, error %d", + lnum, offs, err); + dbg_dump_stack(); + dbg_tnc("key %s", DBGKEY(&bu->key)); + return err; + } + + /* + * Validate the nodes read. This is also reached after %-EBADMSG, which + * is not treated as fatal above. The nodes are expected at 8-byte + * aligned positions one after another in @bu->buf. + */ + buf = bu->buf; + for (i = 0; i < bu->cnt; i++) { + err = validate_data_node(c, buf, &bu->zbranch[i]); + if (err) + return err; + buf = buf + ALIGN(bu->zbranch[i].len, 8); + } + + return 0; +} + +/** + * do_lookup_nm- look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function look up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. 
+ */ +static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int found, n, err; + struct ubifs_znode *znode; + + dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); + mutex_lock(&c->tnc_mutex); + found = ubifs_lookup_level0(c, key, &znode, &n); + if (!found) { + err = -ENOENT; + goto out_unlock; + } else if (found < 0) { + err = found; + goto out_unlock; + } + + ubifs_assert(n >= 0); + + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + if (err == 0) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_node_nm(c, &znode->zbranch[n], node); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_nm - look up a "hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @nm: node name + * + * This function looks up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look at all of them until the needed one is + * found. This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm) +{ + int err, len; + const struct ubifs_dent_node *dent = node; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. 
+ */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + len = le16_to_cpu(dent->nlen); + if (nm->len == len && !memcmp(dent->name, nm->name, len)) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them and look at each direntry with colliding name hash sequentially. + */ + return do_lookup_nm(c, key, node, nm); +} + +/** + * correct_parent_keys - correct parent znodes' keys. + * @c: UBIFS file-system description object + * @znode: znode to correct parent znodes for + * + * This is a helper function for 'tnc_insert()'. When the key of the leftmost + * zbranch changes, keys of parent znodes have to be corrected. This helper + * function is called in such situations and corrects the keys if needed. + * + * The function walks up the tree for as long as the new key is smaller than + * the parent's first key, and stops at the root or at the first znode which is + * not the leftmost child of its parent. Every znode it moves up to gets its + * 'alt' flag set. + */ +static void correct_parent_keys(const struct ubifs_info *c, + struct ubifs_znode *znode) +{ + union ubifs_key *key, *key1; + + ubifs_assert(znode->parent); + ubifs_assert(znode->iip == 0); + + key = &znode->zbranch[0].key; + key1 = &znode->parent->zbranch[0].key; + + while (keys_cmp(c, key, key1) < 0) { + key_copy(c, key, key1); + znode = znode->parent; + znode->alt = 1; + if (!znode->parent || znode->iip) + break; + key1 = &znode->parent->zbranch[0].key; + } +} + +/** + * insert_zbranch - insert a zbranch into a znode. + * @znode: znode into which to insert + * @zbr: zbranch to insert + * @n: slot number to insert to + * + * This is a helper function for 'tnc_insert()'. 
UBIFS does not allow "gaps" in + * znode's array of zbranches and keeps zbranches consolidated, so when a new + * zbranch has to be inserted to the @znode->zbranches[]' array at the @n-th + * slot, zbranches starting from @n have to be moved right. 
+ */ +static void insert_zbranch(struct ubifs_znode *znode, + const struct ubifs_zbranch *zbr, int n) +{ + int i; + + ubifs_assert(ubifs_zn_dirty(znode)); + + if (znode->level) { /* non-leaf: keep the 'iip' of moved and inserted children in sync */ + for (i = znode->child_cnt; i > n; i--) { + znode->zbranch[i] = znode->zbranch[i - 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + if (zbr->znode) + zbr->znode->iip = n; + } else + for (i = znode->child_cnt; i > n; i--) + znode->zbranch[i] = znode->zbranch[i - 1]; + + znode->zbranch[n] = *zbr; + znode->child_cnt += 1; + + /* + * After inserting at slot zero, the lower bound of the key range of + * this znode may have changed. If this znode is subsequently split + * then the upper bound of the key range may change, and furthermore + * it could change to be lower than the original lower bound. If that + * happens, then it will no longer be possible to find this znode in the + * TNC using the key from the index node on flash. That is bad because + * if it is not found, we will assume it is obsolete and may overwrite + * it. Then if there is an unclean unmount, we will start using the + * old index which will be broken. + * + * So we first mark znodes that have insertions at slot zero (by setting + * their 'alt' flag), and then if they are split we add their lnum/offs + * to the old_idx tree. + */ + if (n == 0) + znode->alt = 1; +} + +/** + * tnc_insert - insert a node into TNC. + * @c: UBIFS file-system description object + * @znode: znode to insert into + * @zbr: branch to insert + * @n: slot number to insert new zbranch to + * + * This function inserts a new node described by @zbr into znode @znode. If + * znode does not have a free slot for new zbranch, it is split. Parent znodes + * are splat as well if needed. Returns zero in case of success or a negative + * error code in case of failure. 
+ */ +static int tnc_insert(struct ubifs_info *c, struct ubifs_znode *znode, + struct ubifs_zbranch *zbr, int n) +{ + struct ubifs_znode *zn, *zi, *zp; + int i, keep, move, appending = 0; + union ubifs_key *key = &zbr->key, *key1; + + ubifs_assert(n >= 0 && n <= c->fanout); + + /* Implement naive insert for now */ +again: + zp = znode->parent; + if (znode->child_cnt < c->fanout) { + ubifs_assert(n != c->fanout); + dbg_tnc("inserted at %d level %d, key %s", n, znode->level, + DBGKEY(key)); + + insert_zbranch(znode, zbr, n); + + /* Ensure parent's key is correct */ + if (n == 0 && zp && znode->iip == 0) + correct_parent_keys(c, znode); + + return 0; + } + + /* + * Unfortunately, @znode does not have more empty slots and we have to + * split it. + */ + dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key)); + + if (znode->alt) + /* + * We can no longer be sure of finding this znode by key, so we + * record it in the old_idx tree. 
+ */ + ins_clr_old_idx_znode(c, znode); + + zn = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zn) + return -ENOMEM; + zn->parent = zp; + zn->level = znode->level; + + /* Decide where to split */ + if (znode->level == 0 && key_type(c, key) == UBIFS_DATA_KEY) { + /* Try not to split consecutive data keys */ + if (n == c->fanout) { + key1 = &znode->zbranch[n - 1].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) + appending = 1; + } else + goto check_split; + } else if (appending && n != c->fanout) { + /* Try not to split consecutive data keys */ + appending = 0; +check_split: + if (n >= (c->fanout + 1) / 2) { + key1 = &znode->zbranch[0].key; + if (key_inum(c, key1) == key_inum(c, key) && + key_type(c, key1) == UBIFS_DATA_KEY) { + key1 = &znode->zbranch[n].key; + if (key_inum(c, key1) != key_inum(c, key) || + key_type(c, key1) != UBIFS_DATA_KEY) { + keep = n; + move = c->fanout - keep; + zi = znode; + goto do_split; + } + } + } + } + + if (appending) { + keep = c->fanout; + move = 0; + } else { +
keep = (c->fanout + 1) / 2; + move = c->fanout - keep; + } + + /* + * Although we don't at present, we could look at the neighbors and see + * if we can move some zbranches there. + */ + + if (n < keep) { + /* Insert into existing znode */ + zi = znode; + move += 1; + keep -= 1; + } else { + /* Insert into new znode */ + zi = zn; + n -= keep; + /* Re-parent */ + if (zn->level != 0) + zbr->znode->parent = zn; + } + +do_split: + + __set_bit(DIRTY_ZNODE, &zn->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zn->child_cnt = move; + znode->child_cnt = keep; + + dbg_tnc("moving %d, keeping %d", move, keep); + + /* Move zbranch */ + for (i = 0; i < move; i++) { + zn->zbranch[i] = znode->zbranch[keep + i]; + /* Re-parent */ + if (zn->level != 0) + if (zn->zbranch[i].znode) { + zn->zbranch[i].znode->parent = zn; + zn->zbranch[i].znode->iip = i; + } + } + + /* Insert new key and branch */ + dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key)); + + insert_zbranch(zi, zbr, n); + + /* Insert new znode (produced by splitting) into the parent */ + if (zp) { + if (n == 0 && zi == znode && znode->iip == 0) + correct_parent_keys(c, znode); + + /* Locate insertion point */ + n = znode->iip + 1; + + /* Tail recursion: @zbr is re-used to describe @zn for the parent */ + zbr->key = zn->zbranch[0].key; + zbr->znode = zn; + zbr->lnum = 0; + zbr->offs = 0; + zbr->len = 0; + znode = zp; + + goto again; + } + + /* We have to split root znode */ + dbg_tnc("creating new zroot at level %d", znode->level + 1); + + zi = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!zi) + return -ENOMEM; + + zi->child_cnt = 2; + zi->level = znode->level + 1; + + __set_bit(DIRTY_ZNODE, &zi->flags); + atomic_long_inc(&c->dirty_zn_cnt); + + zi->zbranch[0].key = znode->zbranch[0].key; + zi->zbranch[0].znode = znode; + zi->zbranch[0].lnum = c->zroot.lnum; + zi->zbranch[0].offs = c->zroot.offs; + zi->zbranch[0].len = c->zroot.len; + zi->zbranch[1].key = zn->zbranch[0].key; + zi->zbranch[1].znode = zn; + + 
c->zroot.lnum = 0; + c->zroot.offs = 0; +
c->zroot.len = 0; + c->zroot.znode = zi; + + zn->parent = zi; + zn->iip = 1; + znode->parent = zi; + znode->iip = 0; + + return 0; +} + +/** + * ubifs_tnc_add - add a node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function adds a node with key @key to TNC. The node may be new or it may + * obsolete some existing one. In the latter case 'ubifs_add_dirt()' is called + * for the old node and the existing zbranch is updated to the new location. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + } else if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } else + err = found; + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + + return err; +} + +/** + * ubifs_tnc_replace - replace a node in the TNC only if the old node is found. + * @c: UBIFS file-system description object + * @key: key to add + * @old_lnum: LEB number of old node + * @old_offs: old node offset + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * + * This function replaces a node with key @key in the TNC only if the old node + * is found. This function is called by garbage collection when node are moved. + * Returns %0 on success or negative error code on failure. 
+ */ +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum, + old_offs, lnum, offs, len, DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + found = 0; + if (zbr->lnum == old_lnum && zbr->offs == old_offs) { + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + found = 1; + } else if (is_hash_key(c, key)) { + found = resolve_collision_directly(c, key, &znode, &n, + old_lnum, old_offs); + dbg_tnc("rc returned %d, znode %p, n %d, LEB %d:%d", + found, znode, n, old_lnum, old_offs); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found) { + /* + * Ensure the znode is dirtied, @znode may have + * been changed by the collision resolution. 
+ */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + zbr = &znode->zbranch[n]; + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, + zbr->len); + if (err) + goto out_unlock; + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + } + } + } + + if (!found) + err = ubifs_add_dirt(c, lnum, len); /* old node not found, so no zbranch refers to the new one */ + + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_add_nm - add a "hashed" node to TNC. + * @c: UBIFS file-system description object + * @key: key to add + * @lnum: LEB number of node + * @offs: node offset + * @len: node length + * @nm: node name + * + * This is the same as 'ubifs_tnc_add()' but it should be used with keys which + * may have collisions, like directory entry keys. 
+ */ +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name, + DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + + if (found == 1) { + if (c->replaying) + found = fallible_resolve_collision(c, key, &znode, &n, + nm, 1); + else + found = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", found, znode, n); + if (found < 0) { + err = found; + goto out_unlock; + } + + /* + * Ensure the znode is dirtied. This is needed both for replacing + * the found zbranch and for inserting a new one further below. + */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + + if (found == 1) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + lnc_free(zbr); + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + zbr->lnum = lnum; + zbr->offs = offs; + zbr->len = len; + goto out_unlock; + } + } + + if (!found) { + struct ubifs_zbranch zbr; + + zbr.znode = NULL; + zbr.lnum = lnum; + zbr.offs = offs; + zbr.len = len; + key_copy(c, key, &zbr.key); + err = tnc_insert(c, znode, &zbr, n + 1); + if (err) + goto out_unlock; + if (c->replaying) { + /* + * We did not find it in the index so there may be a + * dangling branch still in the index. So we remove it + * by passing 'ubifs_tnc_remove_nm()' the same key but + * an unmatchable name. The TNC mutex is dropped first + * because 'ubifs_tnc_remove_nm()' takes it itself. 
+ */ + struct qstr noname = { .len = 0, .name = "" }; + + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + if (err) + return err; + return ubifs_tnc_remove_nm(c, key, &noname); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * tnc_delete - delete a znode form TNC. 
+ * @c: UBIFS file-system description object + * @znode: znode to delete from + * @n: zbranch slot number to delete + * + * This function deletes a leaf node from @n-th slot of @znode. Returns zero in + * case of success and a negative error code in case of failure. + */ +static int tnc_delete(struct ubifs_info *c, struct ubifs_znode *znode, int n) +{ + struct ubifs_zbranch *zbr; + struct ubifs_znode *zp; + int i, err; + + /* Delete without merge for now */ + ubifs_assert(znode->level == 0); + ubifs_assert(n >= 0 && n < c->fanout); + dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key)); + + zbr = &znode->zbranch[n]; + lnc_free(zbr); + + err = ubifs_add_dirt(c, zbr->lnum, zbr->len); + if (err) { + dbg_dump_znode(c, znode); + return err; + } + + /* We do not "gap" zbranch slots */ + for (i = n; i < znode->child_cnt - 1; i++) + znode->zbranch[i] = znode->zbranch[i + 1]; + znode->child_cnt -= 1; + + if (znode->child_cnt > 0) + return 0; + + /* + * This was the last zbranch, we have to delete this znode from the + * parent. The loop below also gets rid of every ancestor whose only + * child is being removed. + */ + + do { + ubifs_assert(!test_bit(OBSOLETE_ZNODE, &znode->flags)); + ubifs_assert(ubifs_zn_dirty(znode)); + + zp = znode->parent; + n = znode->iip; + + atomic_long_dec(&c->dirty_zn_cnt); + + err = insert_old_idx_znode(c, znode); + if (err) + return err; + + if (znode->cnext) { + __set_bit(OBSOLETE_ZNODE, &znode->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(znode); + znode = zp; + } while (znode->child_cnt == 1); /* while removing last child */ + + /* Remove entry n from znode */ + znode->child_cnt -= 1; + ubifs_assert(znode->level != 0); + for (i = n; i < znode->child_cnt; i++) { + znode->zbranch[i] = znode->zbranch[i + 1]; + if (znode->zbranch[i].znode) + znode->zbranch[i].znode->iip = i; + } + + /* + * If this is the root and it has only 1 child then + * collapse the tree. 
+ */ + if (!znode->parent) { + while (znode->child_cnt == 1 && znode->level != 0) { + zp = znode; + zbr = &znode->zbranch[0]; + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode = dirty_cow_znode(c, zbr); + if (IS_ERR(znode)) + return PTR_ERR(znode); + znode->parent = NULL; + znode->iip = 0; + if (c->zroot.len) { + err = insert_old_idx(c, c->zroot.lnum, + c->zroot.offs); + if (err) + return err; + } + c->zroot.lnum = zbr->lnum; + c->zroot.offs = zbr->offs; + c->zroot.len = zbr->len; + c->zroot.znode = znode; + ubifs_assert(!test_bit(OBSOLETE_ZNODE, + &zp->flags)); + ubifs_assert(test_bit(DIRTY_ZNODE, &zp->flags)); + atomic_long_dec(&c->dirty_zn_cnt); + + if (zp->cnext) { + __set_bit(OBSOLETE_ZNODE, &zp->flags); + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } else + kfree(zp); + } + } + + return 0; +} + +/** + * ubifs_tnc_remove - remove an index entry of a node. + * @c: UBIFS file-system description object + * @key: key of node + * + * It is not an error if @key is not in the TNC, nothing is deleted then. + * Returns %0 on success or negative error code on failure. + */ +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key) +{ + int found, n, err = 0; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("key %s", DBGKEY(key)); + found = lookup_level0_dirty(c, key, &znode, &n); + if (found < 0) { + err = found; + goto out_unlock; + } + if (found == 1) + err = tnc_delete(c, znode, n); + if (!err) + err = dbg_check_tnc(c, 0); + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_remove_nm - remove an index entry for a "hashed" node. + * @c: UBIFS file-system description object + * @key: key of node + * @nm: directory entry name + * + * Returns %0 on success or negative error code on failure. 
+ */ +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm) +{ + int n, err; + struct ubifs_znode *znode; + + mutex_lock(&c->tnc_mutex); + dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key)); + err = lookup_level0_dirty(c, key, &znode, &n); + if (err < 0) + goto out_unlock; + + if (err) { + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); + if (err < 0) + goto out_unlock; + if (err) { + /* Ensure the znode is dirtied */ + if (znode->cnext || !ubifs_zn_dirty(znode)) { + znode = dirty_cow_bottom_up(c, znode); + if (IS_ERR(znode)) { + err = PTR_ERR(znode); + goto out_unlock; + } + } + err = tnc_delete(c, znode, n); + } + } + +out_unlock: + if (!err) + err = dbg_check_tnc(c, 0); + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * key_in_range - determine if a key falls within a range of keys. + * @c: UBIFS file-system description object + * @key: key to check + * @from_key: lowest key in range + * @to_key: highest key in range + * + * This function returns %1 if the key is in range and %0 otherwise. + */ +static int key_in_range(struct ubifs_info *c, union ubifs_key *key, + union ubifs_key *from_key, union ubifs_key *to_key) +{ + if (keys_cmp(c, key, from_key) < 0) + return 0; + if (keys_cmp(c, key, to_key) > 0) + return 0; + return 1; +} + +/** + * ubifs_tnc_remove_range - remove index entries in range. + * @c: UBIFS file-system description object + * @from_key: lowest key to remove + * @to_key: highest key to remove + * + * This function removes index entries starting at @from_key and ending at + * @to_key. This function returns zero in case of success and a negative error + * code in case of failure. 
+ */
+int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
+			   union ubifs_key *to_key)
+{
+	int i, n, k, err = 0;
+	struct ubifs_znode *znode;
+	union ubifs_key *key;
+
+	mutex_lock(&c->tnc_mutex);
+	while (1) {
+		/*
+		 * Find first level 0 znode that contains keys to remove. A
+		 * positive lookup result means @from_key itself sits in slot
+		 * @n. Otherwise step to the next entry and finish successfully
+		 * if there is none or if it lies outside the range.
+		 */
+		err = ubifs_lookup_level0(c, from_key, &znode, &n);
+		if (err < 0)
+			goto out_unlock;
+
+		if (err)
+			key = from_key;
+		else {
+			err = tnc_next(c, &znode, &n);
+			if (err == -ENOENT) {
+				err = 0;
+				goto out_unlock;
+			}
+			if (err < 0)
+				goto out_unlock;
+			key = &znode->zbranch[n].key;
+			if (!key_in_range(c, key, from_key, to_key)) {
+				err = 0;
+				goto out_unlock;
+			}
+		}
+
+		/* Ensure the znode is dirtied before it is modified */
+		if (znode->cnext || !ubifs_zn_dirty(znode)) {
+			znode = dirty_cow_bottom_up(c, znode);
+			if (IS_ERR(znode)) {
+				err = PTR_ERR(znode);
+				goto out_unlock;
+			}
+		}
+
+		/*
+		 * Remove all keys in range except the first. @k counts the
+		 * entries of this znode that were released; the branches
+		 * following them are moved down by @k slots afterwards.
+		 */
+		for (i = n + 1, k = 0; i < znode->child_cnt; i++, k++) {
+			key = &znode->zbranch[i].key;
+			if (!key_in_range(c, key, from_key, to_key))
+				break;
+			lnc_free(&znode->zbranch[i]);
+			err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
+					     znode->zbranch[i].len);
+			if (err) {
+				dbg_dump_znode(c, znode);
+				goto out_unlock;
+			}
+			dbg_tnc("removing %s", DBGKEY(key));
+		}
+		if (k) {
+			for (i = n + 1 + k; i < znode->child_cnt; i++)
+				znode->zbranch[i - k] = znode->zbranch[i];
+			znode->child_cnt -= k;
+		}
+
+		/*
+		 * Now delete the first with 'tnc_delete()', then go round
+		 * again and look @from_key up afresh.
+		 */
+		err = tnc_delete(c, znode, n);
+		if (err)
+			goto out_unlock;
+	}
+
+out_unlock:
+	if (!err)
+		err = dbg_check_tnc(c, 0);
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_tnc_remove_ino - remove an inode from TNC.
+ * @c: UBIFS file-system description object
+ * @inum: inode number to remove
+ *
+ * This function remove inode @inum and all the extended attributes associated
+ * with the anode from TNC and returns zero in case of success or a negative
+ * error code in case of failure.
+ */
+int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum)
+{
+	union ubifs_key key1, key2;
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	struct qstr nm = { .name = NULL };
+
+	dbg_tnc("ino %lu", (unsigned long)inum);
+
+	/*
+	 * Walk all extended attribute entries and remove them together with
+	 * corresponding extended attribute inodes.
+	 *
+	 * @pxent is the entry found by the previous iteration. It has to stay
+	 * allocated while the next entry is looked up, because @nm.name points
+	 * into it. So it belongs to this function and must be freed on every
+	 * way out of the loop, including the error returns.
+	 */
+	lowest_xent_key(c, &key1, inum);
+	while (1) {
+		ino_t xattr_inum;
+		int err;
+
+		xent = ubifs_tnc_next_ent(c, &key1, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			if (err == -ENOENT)
+				/* No more entries, @pxent is freed below */
+				break;
+			kfree(pxent);
+			return err;
+		}
+
+		xattr_inum = le64_to_cpu(xent->inum);
+		dbg_tnc("xent '%s', ino %lu", xent->name,
+			(unsigned long)xattr_inum);
+
+		/* Drop the extended attribute entry itself */
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+		err = ubifs_tnc_remove_nm(c, &key1, &nm);
+		if (err) {
+			kfree(pxent);
+			kfree(xent);
+			return err;
+		}
+
+		/* Drop everything indexed under the extended attribute inode */
+		lowest_ino_key(c, &key1, xattr_inum);
+		highest_ino_key(c, &key2, xattr_inum);
+		err = ubifs_tnc_remove_range(c, &key1, &key2);
+		if (err) {
+			kfree(pxent);
+			kfree(xent);
+			return err;
+		}
+
+		/* @nm.name now points into @xent, the older entry can go */
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key1);
+	}
+
+	kfree(pxent);
+	lowest_ino_key(c, &key1, inum);
+	highest_ino_key(c, &key2, inum);
+
+	return ubifs_tnc_remove_range(c, &key1, &key2);
+}
+
+/**
+ * ubifs_tnc_next_ent - walk directory or extended attribute entries.
+ * @c: UBIFS file-system description object
+ * @key: key of last entry
+ * @nm: name of last entry found or %NULL
+ *
+ * This function finds and reads the next directory or extended attribute entry
+ * after the given key (@key) if there is one. @nm is used to resolve
+ * collisions.
+ *
+ * If the name of the current entry is not known and only the key is known,
+ * @nm->name has to be %NULL. In this case the semantics of this function is a
+ * little bit different and it returns the entry corresponding to this key, not
+ * the next one. If the key was not found, the closest "right" entry is
+ * returned.
+ * + * If the fist entry has to be found, @key has to contain the lowest possible + * key value for this inode and @name has to be %NULL. + * + * This function returns the found directory or extended attribute entry node + * in case of success, %-ENOENT is returned if no entry was found, and a + * negative error code is returned in case of failure. + */ +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + const struct qstr *nm) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_dent_node *dent; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey; + + dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key)); + ubifs_assert(is_hash_key(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + if (nm->name) { + if (err) { + /* Handle collisions */ + err = resolve_collision(c, key, &znode, &n, nm); + dbg_tnc("rc returned %d, znode %p, n %d", + err, znode, n); + if (unlikely(err < 0)) + goto out_unlock; + } + + /* Now find next entry */ + err = tnc_next(c, &znode, &n); + if (unlikely(err)) + goto out_unlock; + } else { + /* + * The full name of the entry was not given, in which case the + * behavior of this function is a little different and it + * returns current entry, not the next one. + */ + if (!err) { + /* + * However, the given key does not exist in the TNC + * tree and @znode/@n variables contain the closest + * "preceding" element. Switch to the next one. + */ + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + } + + zbr = &znode->zbranch[n]; + dent = kmalloc(zbr->len, GFP_NOFS); + if (unlikely(!dent)) { + err = -ENOMEM; + goto out_unlock; + } + + /* + * The above 'tnc_next()' call could lead us to the next inode, check + * this. 
+ */ + dkey = &zbr->key; + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_free; + } + + err = tnc_read_node_nm(c, zbr, dent); + if (unlikely(err)) + goto out_free; + + mutex_unlock(&c->tnc_mutex); + return dent; + +out_free: + kfree(dent); +out_unlock: + mutex_unlock(&c->tnc_mutex); + return ERR_PTR(err); +} + +/** + * tnc_destroy_cnext - destroy left-over obsolete znodes from a failed commit. + * @c: UBIFS file-system description object + * + * Destroy left-over obsolete znodes from a failed commit. + */ +static void tnc_destroy_cnext(struct ubifs_info *c) +{ + struct ubifs_znode *cnext; + + if (!c->cnext) + return; + ubifs_assert(c->cmt_state == COMMIT_BROKEN); + cnext = c->cnext; + do { + struct ubifs_znode *znode = cnext; + + cnext = cnext->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + } while (cnext && cnext != c->cnext); +} + +/** + * ubifs_tnc_close - close TNC subsystem and free all related resources. + * @c: UBIFS file-system description object + */ +void ubifs_tnc_close(struct ubifs_info *c) +{ + long clean_freed; + + tnc_destroy_cnext(c); + if (c->zroot.znode) { + clean_freed = ubifs_destroy_tnc_subtree(c->zroot.znode); + atomic_long_sub(clean_freed, &ubifs_clean_zn_cnt); + } + kfree(c->gap_lebs); + kfree(c->ilebs); + destroy_old_idx(c); +} + +/** + * left_znode - get the znode to the left. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the left of @znode or NULL if + * there is not one. A negative error code is returned on failure. 
+ */ +static struct ubifs_znode *left_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip - 1; + + /* Go up until we can go left */ + znode = znode->parent; + if (!znode) + return NULL; + if (n >= 0) { + /* Now go down the rightmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + n = znode->child_cnt - 1; + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * right_znode - get the znode to the right. + * @c: UBIFS file-system description object + * @znode: znode + * + * This function returns a pointer to the znode to the right of @znode or NULL + * if there is not one. A negative error code is returned on failure. + */ +static struct ubifs_znode *right_znode(struct ubifs_info *c, + struct ubifs_znode *znode) +{ + int level = znode->level; + + while (1) { + int n = znode->iip + 1; + + /* Go up until we can go right */ + znode = znode->parent; + if (!znode) + return NULL; + if (n < znode->child_cnt) { + /* Now go down the leftmost branch to 'level' */ + znode = get_znode(c, znode, n); + if (IS_ERR(znode)) + return znode; + while (znode->level != level) { + znode = get_znode(c, znode, 0); + if (IS_ERR(znode)) + return znode; + } + break; + } + } + return znode; +} + +/** + * lookup_znode - find a particular indexing node from TNC. + * @c: UBIFS file-system description object + * @key: index node key to lookup + * @level: index node level + * @lnum: index node LEB number + * @offs: index node offset + * + * This function searches an indexing node by its first key @key and its + * address @lnum:@offs. It looks up the indexing tree by pulling all indexing + * nodes it traverses to TNC. This function is called fro indexing nodes which + * were found on the media by scanning, for example when garbage-collecting or + * when doing in-the-gaps commit. 
This means that the indexing node which is
+ * looked for does not have to have exactly the same leftmost key @key, because
+ * the leftmost key may have been changed, in which case TNC will contain a
+ * dirty znode which still refers the same @lnum:@offs. This function is clever
+ * enough to recognize such indexing nodes.
+ *
+ * Note, if a znode was deleted or changed too much, then this function will
+ * not find it. For situations like this UBIFS has the old index RB-tree
+ * (indexed by @lnum:@offs).
+ *
+ * This function returns a pointer to the znode found or %NULL if it is not
+ * found. A negative error code is returned on failure.
+ */
+static struct ubifs_znode *lookup_znode(struct ubifs_info *c,
+					union ubifs_key *key, int level,
+					int lnum, int offs)
+{
+	struct ubifs_znode *znode, *zn;
+	int n, nn;
+
+	/*
+	 * The arguments have probably been read off flash, so don't assume
+	 * they are valid.
+	 */
+	if (level < 0)
+		return ERR_PTR(-EINVAL);
+
+	/* Get the root znode, loading it if it is not in memory yet */
+	znode = c->zroot.znode;
+	if (!znode) {
+		znode = ubifs_load_znode(c, &c->zroot, NULL, 0);
+		if (IS_ERR(znode))
+			return znode;
+	}
+	/* Check if it is the one we are looking for */
+	if (c->zroot.lnum == lnum && c->zroot.offs == offs)
+		return znode;
+	/*
+	 * Descend to the parent level i.e. (level + 1). A node at or above
+	 * the root's level cannot be anywhere below the root.
+	 */
+	if (level >= znode->level)
+		return NULL;
+	while (1) {
+		ubifs_search_zbranch(c, znode, key, &n);
+		if (n < 0) {
+			/*
+			 * We reached a znode where the leftmost key is greater
+			 * than the key we are searching for. This is the same
+			 * situation as the one described in a huge comment at
+			 * the end of the 'ubifs_lookup_level0()' function. And
+			 * for exactly the same reasons we have to try to look
+			 * left before giving up.
+			 */
+			znode = left_znode(c, znode);
+			if (!znode)
+				return NULL;
+			if (IS_ERR(znode))
+				return znode;
+			ubifs_search_zbranch(c, znode, key, &n);
+			ubifs_assert(n >= 0);
+		}
+		if (znode->level == level + 1)
+			break;
+		znode = get_znode(c, znode, n);
+		if (IS_ERR(znode))
+			return znode;
+	}
+	/* Check if the child is the one we are looking for */
+	if (znode->zbranch[n].lnum == lnum && znode->zbranch[n].offs == offs)
+		return get_znode(c, znode, n);
+	/* If the key is unique, there is nowhere else to look */
+	if (!is_hash_key(c, key))
+		return NULL;
+	/*
+	 * The key is not unique and so may be also in the znodes to either
+	 * side. Remember where we are so that the search to the right can
+	 * start from the same place as the search to the left.
+	 */
+	zn = znode;
+	nn = n;
+	/* Look left */
+	while (1) {
+		/* Move one branch to the left */
+		if (n)
+			n -= 1;
+		else {
+			znode = left_znode(c, znode);
+			if (!znode)
+				break;
+			if (IS_ERR(znode))
+				return znode;
+			n = znode->child_cnt - 1;
+		}
+		/* Check it */
+		if (znode->zbranch[n].lnum == lnum &&
+		    znode->zbranch[n].offs == offs)
+			return get_znode(c, znode, n);
+		/* Stop if the key is less than the one we are looking for */
+		if (keys_cmp(c, &znode->zbranch[n].key, key) < 0)
+			break;
+	}
+	/* Back to the middle */
+	znode = zn;
+	n = nn;
+	/* Look right */
+	while (1) {
+		/* Move one branch to the right */
+		if (++n >= znode->child_cnt) {
+			znode = right_znode(c, znode);
+			if (!znode)
+				break;
+			if (IS_ERR(znode))
+				return znode;
+			n = 0;
+		}
+		/* Check it */
+		if (znode->zbranch[n].lnum == lnum &&
+		    znode->zbranch[n].offs == offs)
+			return get_znode(c, znode, n);
+		/* Stop if the key is greater than the one we are looking for */
+		if (keys_cmp(c, &znode->zbranch[n].key, key) > 0)
+			break;
+	}
+	return NULL;
+}
+
+/**
+ * is_idx_node_in_tnc - determine if an index node is in the TNC.
+ * @c: UBIFS file-system description object + * @key: key of index node + * @level: index node level + * @lnum: LEB number of index node + * @offs: offset of index node + * + * This function returns %0 if the index node is not referred to in the TNC, %1 + * if the index node is referred to in the TNC and the corresponding znode is + * dirty, %2 if an index node is referred to in the TNC and the corresponding + * znode is clean, and a negative error code in case of failure. + * + * Note, the @key argument has to be the key of the first child. Also note, + * this function relies on the fact that 0:0 is never a valid LEB number and + * offset for a main-area node. + */ +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs) +{ + struct ubifs_znode *znode; + + znode = lookup_znode(c, key, level, lnum, offs); + if (!znode) + return 0; + if (IS_ERR(znode)) + return PTR_ERR(znode); + + return ubifs_zn_dirty(znode) ? 1 : 2; +} + +/** + * is_leaf_node_in_tnc - determine if a non-indexing not is in the TNC. + * @c: UBIFS file-system description object + * @key: node key + * @lnum: node LEB number + * @offs: node offset + * + * This function returns %1 if the node is referred to in the TNC, %0 if it is + * not, and a negative error code in case of failure. + * + * Note, this function relies on the fact that 0:0 is never a valid LEB number + * and offset for a main-area node. 
+ */
+static int is_leaf_node_in_tnc(struct ubifs_info *c, union ubifs_key *key,
+			       int lnum, int offs)
+{
+	const int unique = !is_hash_key(c, key);
+	struct ubifs_znode *zn, *start_zn;
+	struct ubifs_zbranch *br;
+	int slot, start_slot, ret;
+
+	ret = ubifs_lookup_level0(c, key, &zn, &slot);
+	if (ret <= 0)
+		/* Error code, or zero because the key is not in the TNC */
+		return ret;
+
+	br = &zn->zbranch[slot];
+	if (br->lnum == lnum && br->offs == offs)
+		return 1;
+	if (unique)
+		return 0;
+
+	/*
+	 * The key is not unique, so entries with the same key on either side
+	 * have to be checked too. Remember the starting point for the second
+	 * pass.
+	 */
+	start_zn = zn;
+	start_slot = slot;
+
+	/* First pass: walk to the left while the key stays the same */
+	for (;;) {
+		ret = tnc_prev(c, &zn, &slot);
+		if (ret == -ENOENT)
+			break;
+		if (ret)
+			return ret;
+		br = &zn->zbranch[slot];
+		if (keys_cmp(c, key, &br->key))
+			break;
+		if (br->lnum == lnum && br->offs == offs)
+			return 1;
+	}
+
+	/* Second pass: the same to the right of the starting point */
+	zn = start_zn;
+	slot = start_slot;
+	for (;;) {
+		ret = tnc_next(c, &zn, &slot);
+		if (ret == -ENOENT)
+			return 0;
+		if (ret)
+			return ret;
+		br = &zn->zbranch[slot];
+		if (keys_cmp(c, key, &br->key))
+			break;
+		if (br->lnum == lnum && br->offs == offs)
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * ubifs_tnc_has_node - determine whether a node is in the TNC.
+ * @c: UBIFS file-system description object
+ * @key: node key
+ * @level: index node level (if it is an index node)
+ * @lnum: node LEB number
+ * @offs: node offset
+ * @is_idx: non-zero if the node is an index node
+ *
+ * This function returns %1 if the node is in the TNC, %0 if it is not, and a
+ * negative error code in case of failure. For index nodes, @key has to be the
+ * key of the first child. An index node is considered to be in the TNC only if
+ * the corresponding znode is clean or has not been loaded.
+ */
+int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level,
+		       int lnum, int offs, int is_idx)
+{
+	int err;
+
+	mutex_lock(&c->tnc_mutex);
+	if (is_idx) {
+		err = is_idx_node_in_tnc(c, key, level, lnum, offs);
+		if (err < 0)
+			goto out_unlock;
+		if (err == 1)
+			/*
+			 * The index node was found but it was dirty, which
+			 * is reported to the caller as "not in the TNC"
+			 */
+			err = 0;
+		else if (err == 2)
+			/* The index node was found and it was clean */
+			err = 1;
+		else
+			BUG_ON(err != 0);
+	} else
+		err = is_leaf_node_in_tnc(c, key, lnum, offs);
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
+
+/**
+ * ubifs_dirty_idx_node - dirty an index node.
+ * @c: UBIFS file-system description object
+ * @key: index node key
+ * @level: index node level
+ * @lnum: index node LEB number
+ * @offs: index node offset
+ *
+ * This function loads and dirties an index node so that it can be garbage
+ * collected. The @key argument has to be the key of the first child. This
+ * function relies on the fact that 0:0 is never a valid LEB number and offset
+ * for a main-area node. If 'lookup_znode()' does not find the index node,
+ * nothing is dirtied and %0 is returned. Returns %0 on success and a negative
+ * error code on failure.
+ */
+int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level,
+			 int lnum, int offs)
+{
+	struct ubifs_znode *znode;
+	int err = 0;
+
+	mutex_lock(&c->tnc_mutex);
+	znode = lookup_znode(c, key, level, lnum, offs);
+	if (!znode)
+		goto out_unlock;
+	if (IS_ERR(znode)) {
+		err = PTR_ERR(znode);
+		goto out_unlock;
+	}
+	znode = dirty_cow_bottom_up(c, znode);
+	if (IS_ERR(znode)) {
+		err = PTR_ERR(znode);
+		goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&c->tnc_mutex);
+	return err;
+}
diff -Nurd linux-2.6.24/fs/ubifs/tnc_commit.c ubifs-v2.6.24/fs/ubifs/tnc_commit.c
--- linux-2.6.24/fs/ubifs/tnc_commit.c	1970-01-01 02:00:00.000000000 +0200
+++ ubifs-v2.6.24/fs/ubifs/tnc_commit.c	2009-04-07 17:14:47.000000000 +0200
@@ -0,0 +1,1105 @@
+/*
+ * This file is part of UBIFS.
+ *
+ * Copyright (C) 2006-2008 Nokia Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 51
+ * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Authors: Adrian Hunter
+ *          Artem Bityutskiy (Битюцкий Артём)
+ */
+
+/* This file implements TNC functions for committing */
+
+#include "ubifs.h"
+
+/**
+ * make_idx_node - make an index node for fill-the-gaps method of TNC commit.
+ * @c: UBIFS file-system description object
+ * @idx: buffer in which to place new index node
+ * @znode: znode from which to make new index node
+ * @lnum: LEB number where new index node will be written
+ * @offs: offset where new index node will be written
+ * @len: length of new index node
+ *
+ * Besides filling in @idx, this function points the parent zbranch (or
+ * @c->zroot if @znode has no parent) at @lnum:@offs, adds the aligned length
+ * to @c->calc_idx_sz and clears the %DIRTY_ZNODE and %COW_ZNODE flags of
+ * @znode.
+ *
+ * This function returns whatever 'insert_old_idx_znode()' returned. Note, the
+ * updates listed above are made even if that call failed.
+ */
+static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx,
+			 struct ubifs_znode *znode, int lnum, int offs, int len)
+{
+	struct ubifs_znode *zp;
+	int i, err;
+
+	/* Make index node: header fields first, then one branch per child */
+	idx->ch.node_type = UBIFS_IDX_NODE;
+	idx->child_cnt = cpu_to_le16(znode->child_cnt);
+	idx->level = cpu_to_le16(znode->level);
+	for (i = 0; i < znode->child_cnt; i++) {
+		struct ubifs_branch *br = ubifs_idx_branch(c, idx, i);
+		struct ubifs_zbranch *zbr = &znode->zbranch[i];
+
+		key_write_idx(c, &zbr->key, &br->key);
+		br->lnum = cpu_to_le32(zbr->lnum);
+		br->offs = cpu_to_le32(zbr->offs);
+		br->len = cpu_to_le32(zbr->len);
+		if (!zbr->lnum || !zbr->len) {	/* reported, but not fatal */
+			ubifs_err("bad ref in znode");
+			dbg_dump_znode(c, znode);
+			if (zbr->znode)
+				dbg_dump_znode(c, zbr->znode);
+		}
+	}
+	ubifs_prepare_node(c, idx, len, 0);
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+	znode->lnum = lnum;
+	znode->offs = offs;
+	znode->len = len;
+#endif
+
+	err = insert_old_idx_znode(c, znode);
+
+	/* Update the parent, or the root reference if there is no parent */
+	zp = znode->parent;
+	if (zp) {
+		struct ubifs_zbranch *zbr;
+
+		zbr = &zp->zbranch[znode->iip];
+		zbr->lnum = lnum;
+		zbr->offs = offs;
+		zbr->len = len;
+	} else {
+		c->zroot.lnum = lnum;
+		c->zroot.offs = offs;
+		c->zroot.len = len;
+	}
+	c->calc_idx_sz += ALIGN(len, 8);
+
+	atomic_long_dec(&c->dirty_zn_cnt);
+
+	ubifs_assert(ubifs_zn_dirty(znode));
+	ubifs_assert(test_bit(COW_ZNODE, &znode->flags));
+
+	__clear_bit(DIRTY_ZNODE, &znode->flags);
+	__clear_bit(COW_ZNODE, &znode->flags);
+
+	return err;
+}
+
+/**
+ * fill_gap - make index nodes in gaps in dirty index LEBs.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number that gap appears in
+ * @gap_start: offset of start of gap
+ * @gap_end: offset of end of gap
+ * @dirt: adds dirty space to this
+ *
+ * This function returns the number of index nodes written into the gap.
+ */
+static int fill_gap(struct ubifs_info *c, int lnum, int gap_start, int gap_end,
+		    int *dirt)
+{
+	int len, gap_remains, gap_pos, written, pad_len;
+
+	ubifs_assert((gap_start & 7) == 0);
+	ubifs_assert((gap_end & 7) == 0);
+	ubifs_assert(gap_end >= gap_start);
+
+	gap_remains = gap_end - gap_start;
+	if (!gap_remains)
+		return 0;
+	gap_pos = gap_start;
+	written = 0;
+	while (c->enext) {
+		len = ubifs_idx_node_sz(c, c->enext->child_cnt);
+		if (len < gap_remains) {	/* the next znode still fits */
+			struct ubifs_znode *znode = c->enext;
+			const int alen = ALIGN(len, 8);
+			int err;
+
+			ubifs_assert(alen <= gap_remains);
+			err = make_idx_node(c, c->ileb_buf + gap_pos, znode,
+					    lnum, gap_pos, len);
+			if (err)
+				return err;	/* error code, not a count */
+			gap_remains -= alen;
+			gap_pos += alen;
+			c->enext = znode->cnext;
+			if (c->enext == c->cnext)	/* back at @c->cnext */
+				c->enext = NULL;
+			written += 1;
+		} else
+			break;
+	}
+	if (gap_end == c->leb_size) {
+		c->ileb_len = ALIGN(gap_pos, c->min_io_size);
+		/*
+		 * The gap runs to the end of the LEB, so pad only to end of
+		 * min_io_size; that boundary was recorded in @c->ileb_len
+		 */
+		pad_len = c->ileb_len - gap_pos;
+	} else
+		/* Pad to end of gap */
+		pad_len = gap_remains;
+	dbg_gc("LEB %d:%d to %d len %d nodes written %d wasted bytes %d",
+	       lnum, gap_start, gap_end, gap_end - gap_start, written, pad_len);
+	ubifs_pad(c, c->ileb_buf + gap_pos, pad_len);
+	*dirt += pad_len;	/* the padding is accounted as dirty space */
+	return written;
+}
+
+/**
+ * find_old_idx - find an index node obsoleted since the last commit start.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number of obsoleted index node
+ * @offs: offset of obsoleted index node
+ *
+ * Returns %1 if found and %0 otherwise.
+ */
+static int find_old_idx(struct ubifs_info *c, int lnum, int offs)
+{
+	struct rb_node *node = c->old_idx.rb_node;
+
+	/* The tree is ordered by LEB number first and by offset second */
+	while (node) {
+		struct ubifs_old_idx *old;
+		int go_left;
+
+		old = rb_entry(node, struct ubifs_old_idx, rb);
+		if (lnum != old->lnum)
+			go_left = lnum < old->lnum;
+		else if (offs != old->offs)
+			go_left = offs < old->offs;
+		else
+			return 1;
+
+		node = go_left ? node->rb_left : node->rb_right;
+	}
+	return 0;
+}
+
+/**
+ * is_idx_node_in_use - determine if an index node can be overwritten.
+ * @c: UBIFS file-system description object
+ * @key: key of index node
+ * @level: index node level
+ * @lnum: LEB number of index node
+ * @offs: offset of index node
+ *
+ * If @key / @lnum / @offs identify an index node that was not part of the old
+ * index, then this function returns %0 (obsolete). Else if the index node was
+ * part of the old index but is now dirty %1 is returned, else if it is clean %2
+ * is returned. A negative error code is returned on failure.
+ */
+static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
+			      int level, int lnum, int offs)
+{
+	int in_tnc = is_idx_node_in_tnc(c, key, level, lnum, offs);
+
+	/* An error code, or the TNC refers to it as dirty (1) or clean (2) */
+	if (in_tnc != 0)
+		return in_tnc;
+
+	/* Not in the TNC, but the old index may still contain it */
+	return find_old_idx(c, lnum, offs) ? 1 : 0;
+}
+
+/**
+ * layout_leb_in_gaps - layout index nodes using in-the-gaps method.
+ * @c: UBIFS file-system description object
+ * @p: return LEB number here
+ *
+ * This function lays out new index nodes for dirty znodes using in-the-gaps
+ * method of TNC commit.
+ * This function merely puts the next znode into the next gap, making no attempt
+ * to try to maximise the number of znodes that fit.
+ * This function returns the number of index nodes written into the gaps, or a
+ * negative error code on failure.
+ */
+static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+{
+	struct ubifs_scan_leb *sleb;
+	struct ubifs_scan_node *snod;
+	int lnum, dirt = 0, gap_start, gap_end, err, written, tot_written;
+
+	tot_written = 0;
+	/* Get an index LEB with lots of obsolete index nodes */
+	lnum = ubifs_find_dirty_idx_leb(c);
+	if (lnum < 0)
+		/*
+		 * There also may be dirt in the index head that could be
+		 * filled, however we do not check there at present.
+		 */
+		return lnum; /* Error code */
+	*p = lnum;
+	dbg_gc("LEB %d", lnum);
+	/*
+	 * Scan the index LEB. We use the generic scan for this even though
+	 * it is more comprehensive and less efficient than is needed for this
+	 * purpose. The LEB contents are read into @c->ileb_buf, which is the
+	 * buffer 'fill_gap()' later writes the new index nodes into.
+	 */
+	sleb = ubifs_scan(c, lnum, 0, c->ileb_buf);
+	c->ileb_len = 0;
+	if (IS_ERR(sleb))
+		return PTR_ERR(sleb);
+	gap_start = 0;
+	list_for_each_entry(snod, &sleb->nodes, list) {
+		struct ubifs_idx_node *idx;
+		int in_use, level;
+
+		ubifs_assert(snod->type == UBIFS_IDX_NODE);
+		idx = snod->node;
+		key_read(c, ubifs_idx_key(c, idx), &snod->key);
+		level = le16_to_cpu(idx->level);
+		/* Determine if the index node is in use (not obsolete) */
+		in_use = is_idx_node_in_use(c, &snod->key, level, lnum,
+					    snod->offs);
+		if (in_use < 0) {
+			ubifs_scan_destroy(sleb);
+			return in_use; /* Error code */
+		}
+		if (in_use) {
+			if (in_use == 1)	/* in use, but dirty */
+				dirt += ALIGN(snod->len, 8);
+			/*
+			 * The obsolete index nodes form gaps that can be
+			 * overwritten. This gap has ended because we have
+			 * found an index node that is still in use
+			 * i.e. not obsolete
+			 */
+			gap_end = snod->offs;
+			/* Try to fill gap */
+			written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+			if (written < 0) {
+				ubifs_scan_destroy(sleb);
+				return written; /* Error code */
+			}
+			tot_written += written;
+			gap_start = ALIGN(snod->offs + snod->len, 8);
+		}
+	}
+	ubifs_scan_destroy(sleb);
+	c->ileb_len = c->leb_size;
+	gap_end = c->leb_size;
+	/* Try to fill gap between the last in-use node and the LEB end */
+	written = fill_gap(c, lnum, gap_start, gap_end, &dirt);
+	if (written < 0)
+		return written; /* Error code */
+	tot_written += written;
+	if (tot_written == 0) {
+		struct ubifs_lprops lp;
+
+		dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+		err = ubifs_read_one_lp(c, lnum, &lp);
+		if (err)
+			return err;
+		if (lp.free == c->leb_size) {
+			/*
+			 * We must have snatched this LEB from the idx_gc list
+			 * so we need to correct the free and dirty space.
+			 */
+			err = ubifs_change_one_lp(c, lnum,
+						  c->leb_size - c->ileb_len,
+						  dirt, 0, 0, 0);
+			if (err)
+				return err;
+		}
+		return 0;
+	}
+	err = ubifs_change_one_lp(c, lnum, c->leb_size - c->ileb_len, dirt,
+				  0, 0, 0);
+	if (err)
+		return err;
+	err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
+			       UBI_SHORTTERM);
+	if (err)
+		return err;
+	dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
+	return tot_written;
+}
+
+/**
+ * get_leb_cnt - calculate the number of empty LEBs needed to commit.
+ * @c: UBIFS file-system description object
+ * @cnt: number of znodes to commit
+ *
+ * This function returns the number of empty LEBs needed to commit @cnt znodes
+ * to the current index head. The number is not exact and may be more than
+ * needed.
+ */
+static int get_leb_cnt(struct ubifs_info *c, int cnt)
+{
+	int d;
+
+	/*
+	 * Assume maximum index node size (i.e. overestimate space needed).
+	 * First take off the znodes that still fit after the index head.
+	 */
+	cnt -= (c->leb_size - c->ihead_offs) / c->max_idx_node_sz;
+	if (cnt < 0)
+		cnt = 0;
+	d = c->leb_size / c->max_idx_node_sz;	/* znodes per empty LEB */
+	return DIV_ROUND_UP(cnt, d);
+}
+
+/**
+ * layout_in_gaps - in-the-gaps method of committing TNC.
+ * @c: UBIFS file-system description object + * @cnt: number of dirty znodes to commit. + * + * This function lays out new index nodes for dirty znodes using in-the-gaps + * method of TNC commit. + * + * This function returns %0 on success and a negative error code on failure. + */ +static int layout_in_gaps(struct ubifs_info *c, int cnt) +{ + int err, leb_needed_cnt, written, *p; + + dbg_gc("%d znodes to write", cnt); + + c->gap_lebs = kmalloc(sizeof(int) * (c->lst.idx_lebs + 1), GFP_NOFS); + if (!c->gap_lebs) + return -ENOMEM; + + p = c->gap_lebs; + do { + ubifs_assert(p < c->gap_lebs + sizeof(int) * c->lst.idx_lebs); + written = layout_leb_in_gaps(c, p); + if (written < 0) { + err = written; + if (err != -ENOSPC) { + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return err; + } + if (!dbg_force_in_the_gaps_enabled) { + /* + * Do not print scary warnings if the debugging + * option which forces in-the-gaps is enabled. + */ + ubifs_err("out of space"); + spin_lock(&c->space_lock); + dbg_dump_budg(c); + spin_unlock(&c->space_lock); + dbg_dump_lprops(c); + } + /* Try to commit anyway */ + err = 0; + break; + } + p++; + cnt -= written; + leb_needed_cnt = get_leb_cnt(c, cnt); + dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt, + leb_needed_cnt, c->ileb_cnt); + } while (leb_needed_cnt > c->ileb_cnt); + + *p = -1; + return 0; +} + +/** + * layout_in_empty_space - layout index nodes in empty space. + * @c: UBIFS file-system description object + * + * This function lays out new index nodes for dirty znodes using empty LEBs. + * + * This function returns %0 on success and a negative error code on failure. 
+ */
+static int layout_in_empty_space(struct ubifs_info *c)
+{
+	struct ubifs_znode *znode, *cnext, *zp;
+	int lnum, offs, len, next_len, buf_len, buf_offs, used, avail;
+	int wlen, blen, err;
+
+	cnext = c->enext;
+	if (!cnext)
+		return 0;
+
+	lnum = c->ihead_lnum;
+	buf_offs = c->ihead_offs;
+
+	buf_len = ubifs_idx_node_sz(c, c->fanout);
+	buf_len = ALIGN(buf_len, c->min_io_size);
+	used = 0;
+	avail = buf_len;
+
+	/*
+	 * Ensure there is enough room for first write. If there is not, -1
+	 * makes the loop below take the next LEB from @c->ilebs.
+	 */
+	next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+	if (buf_offs + next_len > c->leb_size)
+		lnum = -1;
+
+	while (1) {
+		znode = cnext;
+
+		len = ubifs_idx_node_sz(c, znode->child_cnt);
+
+		/* Determine the index node position */
+		if (lnum == -1) {
+			if (c->ileb_nxt >= c->ileb_cnt) {
+				ubifs_err("out of space");
+				return -ENOSPC;
+			}
+			lnum = c->ilebs[c->ileb_nxt++];
+			buf_offs = 0;
+			used = 0;
+			avail = buf_len;
+		}
+
+		offs = buf_offs + used;
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+		znode->lnum = lnum;
+		znode->offs = offs;
+		znode->len = len;
+#endif
+
+		/* Update the parent, or the root reference if no parent */
+		zp = znode->parent;
+		if (zp) {
+			struct ubifs_zbranch *zbr;
+			int i;
+
+			i = znode->iip;
+			zbr = &zp->zbranch[i];
+			zbr->lnum = lnum;
+			zbr->offs = offs;
+			zbr->len = len;
+		} else {
+			c->zroot.lnum = lnum;
+			c->zroot.offs = offs;
+			c->zroot.len = len;
+		}
+		c->calc_idx_sz += ALIGN(len, 8);
+
+		/*
+		 * Once lprops is updated, we can decrease the dirty znode count
+		 * but it is easier to just do it here.
+		 */
+		atomic_long_dec(&c->dirty_zn_cnt);
+
+		/*
+		 * Calculate the next index node length to see if there is
+		 * enough room for it. Zero means this was the last znode,
+		 * i.e. the list has come back round to @c->cnext.
+		 */
+		cnext = znode->cnext;
+		if (cnext == c->cnext)
+			next_len = 0;
+		else
+			next_len = ubifs_idx_node_sz(c, cnext->child_cnt);
+
+		if (c->min_io_size == 1) {	/* no write-size padding */
+			buf_offs += ALIGN(len, 8);
+			if (next_len) {
+				if (buf_offs + next_len <= c->leb_size)
+					continue;
+				err = ubifs_update_one_lp(c, lnum, 0,
+						c->leb_size - buf_offs, 0, 0);
+				if (err)
+					return err;
+				lnum = -1;
+				continue;
+			}
+			err = ubifs_update_one_lp(c, lnum,
+					c->leb_size - buf_offs, 0, 0, 0);
+			if (err)
+				return err;
+			break;
+		}
+
+		/*
+		 * Update buffer positions. @wlen is the unaligned end of the
+		 * data placed in the buffer so far, @used is its 8-byte
+		 * aligned counterpart.
+		 */
+		wlen = used + len;
+		used += ALIGN(len, 8);
+		avail -= ALIGN(len, 8);
+
+		if (next_len != 0 &&
+		    buf_offs + used + next_len <= c->leb_size &&
+		    avail > 0)
+			continue;
+
+		if (avail <= 0 && next_len &&
+		    buf_offs + used + next_len <= c->leb_size)
+			blen = buf_len;
+		else
+			blen = ALIGN(wlen, c->min_io_size);
+
+		/* The buffer is full or there are no more znodes to do */
+		buf_offs += blen;
+		if (next_len) {
+			if (buf_offs + next_len > c->leb_size) {
+				err = ubifs_update_one_lp(c, lnum,
+					c->leb_size - buf_offs, blen - used,
+					0, 0);
+				if (err)
+					return err;
+				lnum = -1;
+			}
+			used -= blen;
+			if (used < 0)
+				used = 0;
+			avail = buf_len - used;
+			continue;
+		}
+		err = ubifs_update_one_lp(c, lnum, c->leb_size - buf_offs,
+					  blen - used, 0, 0);
+		if (err)
+			return err;
+		break;
+	}
+
+#ifdef CONFIG_UBIFS_FS_DEBUG
+	c->dbg->new_ihead_lnum = lnum;
+	c->dbg->new_ihead_offs = buf_offs;
+#endif
+
+	return 0;
+}
+
+/**
+ * layout_commit - determine positions of index nodes to commit.
+ * @c: UBIFS file-system description object
+ * @no_space: indicates that insufficient empty LEBs were allocated
+ * @cnt: number of znodes to commit
+ *
+ * Calculate and update the positions of index nodes to commit.
If there were + * an insufficient number of empty LEBs allocated, then index nodes are placed + * into the gaps created by obsolete index nodes in non-empty index LEBs. For + * this purpose, an obsolete index node is one that was not in the index as at + * the end of the last commit. To write "in-the-gaps" requires that those index + * LEBs are updated atomically in-place. + */ +static int layout_commit(struct ubifs_info *c, int no_space, int cnt) +{ + int err; + + if (no_space) { + err = layout_in_gaps(c, cnt); + if (err) + return err; + } + err = layout_in_empty_space(c); + return err; +} + +/** + * find_first_dirty - find first dirty znode. + * @znode: znode to begin searching from (may be %NULL). Returns the znode reached by repeatedly descending into the first loaded dirty child, or %NULL if @znode is %NULL or the znode reached is not dirty. + */ +static struct ubifs_znode *find_first_dirty(struct ubifs_znode *znode) +{ + int i, cont; + + if (!znode) + return NULL; + + while (1) { + if (znode->level == 0) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + cont = 0; + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) { + znode = zbr->znode; + cont = 1; + break; + } + } + if (!cont) { + if (ubifs_zn_dirty(znode)) + return znode; + return NULL; + } + } +} + +/** + * find_next_dirty - find next dirty znode. + * @znode: znode to begin searching from. Returns what find_first_dirty() finds under the next loaded dirty sibling of @znode, the parent of @znode if there is no such sibling, or %NULL if @znode has no parent. + */ +static struct ubifs_znode *find_next_dirty(struct ubifs_znode *znode) +{ + int n = znode->iip + 1; + + znode = znode->parent; + if (!znode) + return NULL; + for (; n < znode->child_cnt; n++) { + struct ubifs_zbranch *zbr = &znode->zbranch[n]; + + if (zbr->znode && ubifs_zn_dirty(zbr->znode)) + return find_first_dirty(zbr->znode); + } + return znode; +} + +/** + * get_znodes_to_commit - create list of dirty znodes to commit. + * @c: UBIFS file-system description object + * + * This function returns the number of znodes to commit. 
+ */ +static int get_znodes_to_commit(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + int cnt = 0; + + c->cnext = find_first_dirty(c->zroot.znode); + znode = c->enext = c->cnext; + if (!znode) { + dbg_cmt("no znodes to commit"); + return 0; + } + cnt += 1; + while (1) { + ubifs_assert(!test_bit(COW_ZNODE, &znode->flags)); + __set_bit(COW_ZNODE, &znode->flags); + znode->alt = 0; + cnext = find_next_dirty(znode); + if (!cnext) { + znode->cnext = c->cnext; + break; + } + znode->cnext = cnext; + znode = cnext; + cnt += 1; + } + dbg_cmt("committing %d znodes", cnt); + ubifs_assert(cnt == atomic_long_read(&c->dirty_zn_cnt)); + return cnt; +} + +/** + * alloc_idx_lebs - allocate empty LEBs to be used to commit. + * @c: UBIFS file-system description object + * @cnt: number of znodes to commit + * + * This function returns %-ENOSPC if it cannot allocate a sufficient number of + * empty LEBs. %0 is returned on success, otherwise a negative error code + * is returned. %-ENOSPC is also returned when dbg_force_in_the_gaps() is true. LEBs found before a failure stay recorded in @c->ilebs. + */ +static int alloc_idx_lebs(struct ubifs_info *c, int cnt) +{ + int i, leb_cnt, lnum; + + c->ileb_cnt = 0; + c->ileb_nxt = 0; + leb_cnt = get_leb_cnt(c, cnt); + dbg_cmt("need about %d empty LEBS for TNC commit", leb_cnt); + if (!leb_cnt) + return 0; + c->ilebs = kmalloc(leb_cnt * sizeof(int), GFP_NOFS); + if (!c->ilebs) + return -ENOMEM; + for (i = 0; i < leb_cnt; i++) { + lnum = ubifs_find_free_leb_for_idx(c); + if (lnum < 0) + return lnum; + c->ilebs[c->ileb_cnt++] = lnum; + dbg_cmt("LEB %d", lnum); + } + if (dbg_force_in_the_gaps()) + return -ENOSPC; + return 0; +} + +/** + * free_unused_idx_lebs - free unused LEBs that were allocated for the commit. + * @c: UBIFS file-system description object + * + * It is possible that we allocate more empty LEBs for the commit than we need. + * This function frees the surplus. + * + * This function returns %0 on success and a negative error code on failure. 
+ */ +static int free_unused_idx_lebs(struct ubifs_info *c) +{ + int i, err = 0, lnum, er; + + for (i = c->ileb_nxt; i < c->ileb_cnt; i++) { + lnum = c->ilebs[i]; + dbg_cmt("LEB %d", lnum); + er = ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, + LPROPS_INDEX | LPROPS_TAKEN, 0); + if (!err) + err = er; + } + return err; +} + +/** + * free_idx_lebs - free unused LEBs after commit end. + * @c: UBIFS file-system description object + * + * This function returns %0 on success and a negative error code on failure. The @c->ilebs array itself is freed in either case. + */ +static int free_idx_lebs(struct ubifs_info *c) +{ + int err; + + err = free_unused_idx_lebs(c); + kfree(c->ilebs); + c->ilebs = NULL; + return err; +} + +/** + * ubifs_tnc_start_commit - start TNC commit. + * @c: UBIFS file-system description object + * @zroot: new index root position is returned here + * + * This function prepares the list of indexing nodes to commit and lays out + * their positions on flash. If there is not enough free space it uses the + * in-gap commit method. Returns zero in case of success and a negative error + * code in case of failure. + */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot) +{ + int err = 0, cnt; + + mutex_lock(&c->tnc_mutex); + err = dbg_check_tnc(c, 1); + if (err) + goto out; + cnt = get_znodes_to_commit(c); + if (cnt != 0) { + int no_space = 0; + + err = alloc_idx_lebs(c, cnt); + if (err == -ENOSPC) + no_space = 1; + else if (err) + goto out_free; + err = layout_commit(c, no_space, cnt); + if (err) + goto out_free; + ubifs_assert(atomic_long_read(&c->dirty_zn_cnt) == 0); + err = free_unused_idx_lebs(c); + if (err) + goto out; + } + destroy_old_idx(c); + memcpy(zroot, &c->zroot, sizeof(struct ubifs_zbranch)); + + err = ubifs_save_dirty_idx_lnums(c); + if (err) + goto out; + + spin_lock(&c->space_lock); + /* + * Although we have not finished committing yet, update the size of the + * committed index ('c->old_idx_sz') and zero out the index growth + * budget. 
It is OK to do this now, because we've reserved all the + * space which is needed to commit the index, and it is safe for the + * budgeting subsystem to assume the index is already committed, + * even though it is not. + */ + ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c)); + c->old_idx_sz = c->calc_idx_sz; + c->budg_uncommitted_idx = 0; + c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); + spin_unlock(&c->space_lock); + mutex_unlock(&c->tnc_mutex); + + dbg_cmt("number of index LEBs %d", c->lst.idx_lebs); + dbg_cmt("size of index %llu", c->calc_idx_sz); + return err; + +out_free: + free_idx_lebs(c); +out: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * write_index - write index nodes. + * @c: UBIFS file-system description object + * + * This function writes the index nodes whose positions were laid out in the + * layout_in_empty_space function. Returns %0 on success and a negative error code on failure. + */ +static int write_index(struct ubifs_info *c) +{ + struct ubifs_idx_node *idx; + struct ubifs_znode *znode, *cnext; + int i, lnum, offs, len, next_len, buf_len, buf_offs, used; + int avail, wlen, err, lnum_pos = 0; + + cnext = c->enext; + if (!cnext) + return 0; + + /* + * Always write index nodes to the index head so that index nodes and + * other types of nodes are never mixed in the same erase block. 
+ */ + lnum = c->ihead_lnum; + buf_offs = c->ihead_offs; + + /* Commit buffer length (the buffer itself is @c->cbuf) */ + buf_len = ALIGN(c->max_idx_node_sz, c->min_io_size); + used = 0; + avail = buf_len; + + /* Ensure there is enough room for the first write */ + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, LPROPS_NC, 0, 0, + LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + + while (1) { + cond_resched(); + + znode = cnext; + idx = c->cbuf + used; + + /* Make index node */ + idx->ch.node_type = UBIFS_IDX_NODE; + idx->child_cnt = cpu_to_le16(znode->child_cnt); + idx->level = cpu_to_le16(znode->level); + for (i = 0; i < znode->child_cnt; i++) { + struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_write_idx(c, &zbr->key, &br->key); + br->lnum = cpu_to_le32(zbr->lnum); + br->offs = cpu_to_le32(zbr->offs); + br->len = cpu_to_le32(zbr->len); + if (!zbr->lnum || !zbr->len) { + ubifs_err("bad ref in znode"); + dbg_dump_znode(c, znode); + if (zbr->znode) + dbg_dump_znode(c, zbr->znode); + } + } + len = ubifs_idx_node_sz(c, znode->child_cnt); + ubifs_prepare_node(c, idx, len, 0); + + /* Determine the index node position */ + if (lnum == -1) { + lnum = c->ilebs[lnum_pos++]; + buf_offs = 0; + used = 0; + avail = buf_len; + } + offs = buf_offs + used; + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != znode->lnum || offs != znode->offs || + len != znode->len) { + ubifs_err("inconsistent znode posn"); + return -EINVAL; + } +#endif + + /* Grab some stuff from znode while we still can */ + cnext = znode->cnext; + + ubifs_assert(ubifs_zn_dirty(znode)); + ubifs_assert(test_bit(COW_ZNODE, &znode->flags)); + + /* + * It is important that other threads should see %DIRTY_ZNODE + * flag cleared before %COW_ZNODE. Specifically, it matters in + * the 'dirty_cow_znode()' function. This is the reason for the + * first barrier. 
Also, we want the bit changes to be seen by + * other threads ASAP, to avoid unnecessary copying, which is + * the reason for the second barrier. + */ + clear_bit(DIRTY_ZNODE, &znode->flags); + smp_mb__before_clear_bit(); + clear_bit(COW_ZNODE, &znode->flags); + smp_mb__after_clear_bit(); + + /* Do not access znode from this point on */ + + /* Update buffer positions */ + wlen = used + len; + used += ALIGN(len, 8); + avail -= ALIGN(len, 8); + + /* + * Calculate the next index node length to see if there is + * enough room for it + */ + if (cnext == c->cnext) + next_len = 0; + else + next_len = ubifs_idx_node_sz(c, cnext->child_cnt); + + if (c->min_io_size == 1) { + /* + * Write the prepared index node immediately if there is + * no minimum IO size + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + wlen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += ALIGN(wlen, 8); + if (next_len) { + used = 0; + avail = buf_len; + if (buf_offs + next_len > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + continue; + } + } else { + int blen, nxt_offs = buf_offs + used + next_len; + + if (next_len && nxt_offs <= c->leb_size) { + if (avail > 0) + continue; + else + blen = buf_len; + } else { + wlen = ALIGN(wlen, 8); + blen = ALIGN(wlen, c->min_io_size); + ubifs_pad(c, c->cbuf + wlen, blen - wlen); + } + /* + * The buffer is full or there are no more znodes + * to do + */ + err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, + blen, UBI_SHORTTERM); + if (err) + return err; + buf_offs += blen; + if (next_len) { + if (nxt_offs > c->leb_size) { + err = ubifs_update_one_lp(c, lnum, + LPROPS_NC, 0, 0, LPROPS_TAKEN); + if (err) + return err; + lnum = -1; + } + used -= blen; + if (used < 0) + used = 0; + avail = buf_len - used; + memmove(c->cbuf, c->cbuf + blen, used); + continue; + } + } + break; + } + +#ifdef CONFIG_UBIFS_FS_DEBUG + if (lnum != c->dbg->new_ihead_lnum || + buf_offs != 
c->dbg->new_ihead_offs) { + ubifs_err("inconsistent ihead"); + return -EINVAL; + } +#endif + + c->ihead_lnum = lnum; + c->ihead_offs = buf_offs; + + return 0; +} + +/** + * free_obsolete_znodes - free obsolete znodes. + * @c: UBIFS file-system description object + * + * At the end of the commit, obsolete znodes are freed; the other znodes on the commit list are unlinked from it and counted as clean. + */ +static void free_obsolete_znodes(struct ubifs_info *c) +{ + struct ubifs_znode *znode, *cnext; + + cnext = c->cnext; + do { + znode = cnext; + cnext = znode->cnext; + if (test_bit(OBSOLETE_ZNODE, &znode->flags)) + kfree(znode); + else { + znode->cnext = NULL; + atomic_long_inc(&c->clean_zn_cnt); + atomic_long_inc(&ubifs_clean_zn_cnt); + } + } while (cnext != c->cnext); +} + +/** + * return_gap_lebs - return LEBs used by the in-gap commit method. + * @c: UBIFS file-system description object + * + * This function clears the "taken" flag for the LEBs which were used by the + * "commit in-the-gaps" method. Returns %0 on success and a negative error code on failure. NOTE(review): on failure @c->gap_lebs is not freed here - confirm it is released elsewhere. + */ +static int return_gap_lebs(struct ubifs_info *c) +{ + int *p, err; + + if (!c->gap_lebs) + return 0; + + dbg_cmt(""); + for (p = c->gap_lebs; *p != -1; p++) { + err = ubifs_change_one_lp(c, *p, LPROPS_NC, LPROPS_NC, 0, + LPROPS_TAKEN, 0); + if (err) + return err; + } + + kfree(c->gap_lebs); + c->gap_lebs = NULL; + return 0; +} + +/** + * ubifs_tnc_end_commit - update the TNC for commit end. + * @c: UBIFS file-system description object + * + * Write the dirty znodes. 
+ */ +int ubifs_tnc_end_commit(struct ubifs_info *c) +{ + int err; + + if (!c->cnext) + return 0; + + err = return_gap_lebs(c); + if (err) + return err; + + err = write_index(c); + if (err) + return err; + + mutex_lock(&c->tnc_mutex); + + dbg_cmt("TNC height is %d", c->zroot.znode->level + 1); + + free_obsolete_znodes(c); + + c->cnext = NULL; + kfree(c->ilebs); + c->ilebs = NULL; + + mutex_unlock(&c->tnc_mutex); + + return 0; +} diff -Nurd linux-2.6.24/fs/ubifs/tnc_misc.c ubifs-v2.6.24/fs/ubifs/tnc_misc.c --- linux-2.6.24/fs/ubifs/tnc_misc.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/tnc_misc.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,494 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Adrian Hunter + * Artem Bityutskiy (Битюцкий Артём) + */ + +/* + * This file contains miscellaneous TNC-related functions shared between + * different files. This file does not form any logically separate TNC + * sub-system. The file was created because there is a lot of TNC code and + * putting it all in one file would make that file too big and unreadable. + */ + +#include "ubifs.h" + +/** + * ubifs_tnc_levelorder_next - next TNC tree element in levelorder traversal. 
+ * @zr: root of the subtree to traverse + * @znode: previous znode + * + * This function implements levelorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode) +{ + int level, iip, level_search = 0; + struct ubifs_znode *zn; + + ubifs_assert(zr); + + if (unlikely(!znode)) + return zr; + + if (unlikely(znode == zr)) { + if (znode->level == 0) + return NULL; + return ubifs_tnc_find_child(zr, 0); + } + + level = znode->level; + + iip = znode->iip; + while (1) { + ubifs_assert(znode->level <= zr->level); + + /* + * First walk up until there is a znode with a next branch to + * look at. + */ + while (znode->parent != zr && iip >= znode->parent->child_cnt) { + znode = znode->parent; + iip = znode->iip; + } + + if (unlikely(znode->parent == zr && + iip >= znode->parent->child_cnt)) { + /* This level is done, switch to the lower one */ + level -= 1; + if (level_search || level < 0) + /* + * We were already looking for znode at lower + * level ('level_search'). As we are here + * again, it just does not exist. Or all levels + * were finished ('level < 0'). + */ + return NULL; + + level_search = 1; + iip = -1; + znode = ubifs_tnc_find_child(zr, 0); + ubifs_assert(znode); + } + + /* Switch to the next index */ + zn = ubifs_tnc_find_child(znode->parent, iip + 1); + if (!zn) { + /* No more children to look at, we have to walk up */ + iip = znode->parent->child_cnt; + continue; + } + + /* Walk back down to the level we came from ('level') */ + while (zn->level != level) { + znode = zn; + zn = ubifs_tnc_find_child(zn, 0); + if (!zn) { + /* + * This path is not deep enough, so it does not + * reach 'level'. Try the next path. + */ + iip = znode->iip; + break; + } + } + + if (zn) { + ubifs_assert(zn->level >= 0); + return zn; + } + } +} + +/** + * ubifs_search_zbranch - search znode branch. 
+ * @c: UBIFS file-system description object + * @znode: znode to search in + * @key: key to search for + * @n: znode branch slot number is returned here + * + * This is a helper function which searches for the branch with key @key in @znode using + * binary search. The result of the search may be: + * o exact match, then %1 is returned, and the slot number of the branch is + * stored in @n; + * o no exact match, then %0 is returned and the slot number of the left + * closest branch is returned in @n; if all keys in this znode are + * greater than @key, then %-1 is returned in @n. + */ +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n) +{ + int beg = 0, end = znode->child_cnt, uninitialized_var(mid); + int uninitialized_var(cmp); + const struct ubifs_zbranch *zbr = &znode->zbranch[0]; + + ubifs_assert(end > beg); + + while (end > beg) { + mid = (beg + end) >> 1; + cmp = keys_cmp(c, key, &zbr[mid].key); + if (cmp > 0) + beg = mid + 1; + else if (cmp < 0) + end = mid; + else { + *n = mid; + return 1; + } + } + + *n = end - 1; + + /* The insert point is after *n */ + ubifs_assert(*n >= -1 && *n < znode->child_cnt); + if (*n == -1) + ubifs_assert(keys_cmp(c, key, &zbr[0].key) < 0); + else + ubifs_assert(keys_cmp(c, key, &zbr[*n].key) > 0); + if (*n + 1 < znode->child_cnt) + ubifs_assert(keys_cmp(c, key, &zbr[*n + 1].key) < 0); + + return 0; +} + +/** + * ubifs_tnc_postorder_first - find first znode to do postorder tree traversal. + * @znode: znode to start at (root of the sub-tree to traverse) + * + * Find the lowest leftmost znode in a subtree of the TNC tree. The LNC is + * ignored. 
+ */ +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode) +{ + if (unlikely(!znode)) + return NULL; + + while (znode->level > 0) { + struct ubifs_znode *child; + + child = ubifs_tnc_find_child(znode, 0); + if (!child) + return znode; + znode = child; + } + + return znode; +} + +/** + * ubifs_tnc_postorder_next - next TNC tree element in postorder traversal. + * @znode: previous znode + * + * This function implements postorder TNC traversal. The LNC is ignored. + * Returns the next element or %NULL if @znode is already the last one. + */ +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn; + + ubifs_assert(znode); + if (unlikely(!znode->parent)) + return NULL; + + /* Switch to the next index in the parent */ + zn = ubifs_tnc_find_child(znode->parent, znode->iip + 1); + if (!zn) + /* This is in fact the last child, return parent */ + return znode->parent; + + /* Go to the first znode in this new subtree */ + return ubifs_tnc_postorder_first(zn); +} + +/** + * ubifs_destroy_tnc_subtree - destroy all znodes connected to a subtree. + * @znode: znode defining subtree to destroy + * + * This function destroys subtree of the TNC tree. Returns number of clean + * znodes in the subtree. + */ +long ubifs_destroy_tnc_subtree(struct ubifs_znode *znode) +{ + struct ubifs_znode *zn = ubifs_tnc_postorder_first(znode); + long clean_freed = 0; + int n; + + ubifs_assert(zn); + while (1) { + for (n = 0; n < zn->child_cnt; n++) { + if (!zn->zbranch[n].znode) + continue; + + if (zn->level > 0 && + !ubifs_zn_dirty(zn->zbranch[n].znode)) + clean_freed += 1; + + cond_resched(); + kfree(zn->zbranch[n].znode); + } + + if (zn == znode) { + if (!ubifs_zn_dirty(zn)) + clean_freed += 1; + kfree(zn); + return clean_freed; + } + + zn = ubifs_tnc_postorder_next(zn); + } +} + +/** + * read_znode - read an indexing node from flash and fill znode. 
+ * @c: UBIFS file-system description object + * @lnum: LEB of the indexing node to read + * @offs: node offset + * @len: node length + * @znode: znode to read to + * + * This function reads an indexing node from the flash media and fills znode + * with the read data. Returns zero in case of success and a negative error + * code in case of failure. The read indexing node is validated and if anything + * is wrong with it, this function prints complaint messages and returns + * %-EINVAL. + */ +static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, + struct ubifs_znode *znode) +{ + int i, err, type, cmp; + struct ubifs_idx_node *idx; + + idx = kmalloc(c->max_idx_node_sz, GFP_NOFS); + if (!idx) + return -ENOMEM; + + err = ubifs_read_node(c, idx, UBIFS_IDX_NODE, len, lnum, offs); + if (err < 0) { + kfree(idx); + return err; + } + + znode->child_cnt = le16_to_cpu(idx->child_cnt); + znode->level = le16_to_cpu(idx->level); + + dbg_tnc("LEB %d:%d, level %d, %d branch", + lnum, offs, znode->level, znode->child_cnt); + + if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { + dbg_err("current fanout %d, branch count %d", + c->fanout, znode->child_cnt); + dbg_err("max levels %d, znode level %d", + UBIFS_MAX_LEVELS, znode->level); + err = 1; + goto out_dump; + } + + for (i = 0; i < znode->child_cnt; i++) { + const struct ubifs_branch *br = ubifs_idx_branch(c, idx, i); + struct ubifs_zbranch *zbr = &znode->zbranch[i]; + + key_read(c, &br->key, &zbr->key); + zbr->lnum = le32_to_cpu(br->lnum); + zbr->offs = le32_to_cpu(br->offs); + zbr->len = le32_to_cpu(br->len); + zbr->znode = NULL; + + /* Validate branch */ + + if (zbr->lnum < c->main_first || + zbr->lnum >= c->leb_cnt || zbr->offs < 0 || + zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { + dbg_err("bad branch %d", i); + err = 2; + goto out_dump; + } + + switch (key_type(c, &zbr->key)) { + case UBIFS_INO_KEY: + case UBIFS_DATA_KEY: + case UBIFS_DENT_KEY: + case UBIFS_XENT_KEY: + break; + 
default: + dbg_msg("bad key type at slot %d: %s", i, + DBGKEY(&zbr->key)); + err = 3; + goto out_dump; + } + + if (znode->level) + continue; + + type = key_type(c, &zbr->key); + if (c->ranges[type].max_len == 0) { + if (zbr->len != c->ranges[type].len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be %d", c->ranges[type].len); + err = 4; + goto out_dump; + } + } else if (zbr->len < c->ranges[type].min_len || + zbr->len > c->ranges[type].max_len) { + dbg_err("bad target node (type %d) length (%d)", + type, zbr->len); + dbg_err("have to be in range of %d-%d", + c->ranges[type].min_len, + c->ranges[type].max_len); + err = 5; + goto out_dump; + } + } + + /* + * Ensure that the next key is greater or equivalent to the + * previous one. + */ + for (i = 0; i < znode->child_cnt - 1; i++) { + const union ubifs_key *key1, *key2; + + key1 = &znode->zbranch[i].key; + key2 = &znode->zbranch[i + 1].key; + + cmp = keys_cmp(c, key1, key2); + if (cmp > 0) { + dbg_err("bad key order (keys %d and %d)", i, i + 1); + err = 6; + goto out_dump; + } else if (cmp == 0 && !is_hash_key(c, key1)) { + /* These can only be keys with colliding hash */ + dbg_err("keys %d and %d are not hashed but equivalent", + i, i + 1); + err = 7; + goto out_dump; + } + } + + kfree(idx); + return 0; + +out_dump: + ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + dbg_dump_node(c, idx); + kfree(idx); + return -EINVAL; +} + +/** + * ubifs_load_znode - load znode to TNC cache. + * @c: UBIFS file-system description object + * @zbr: znode branch + * @parent: znode's parent + * @iip: index in parent + * + * This function loads znode pointed to by @zbr into the TNC cache and + * returns pointer to it in case of success and a negative error code in case + * of failure. 
+ */ +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip) +{ + int err; + struct ubifs_znode *znode; + + ubifs_assert(!zbr->znode); + /* + * A slab cache is not presently used for znodes because the znode size + * depends on the fanout which is stored in the superblock. + */ + znode = kzalloc(c->max_znode_sz, GFP_NOFS); + if (!znode) + return ERR_PTR(-ENOMEM); + + err = read_znode(c, zbr->lnum, zbr->offs, zbr->len, znode); + if (err) + goto out; + + atomic_long_inc(&c->clean_zn_cnt); + + /* + * Increment the global clean znode counter as well. It is OK that + * global and per-FS clean znode counters may be inconsistent for some + * short time (because we might be preempted at this point), the global + * one is only used in the shrinker. + */ + atomic_long_inc(&ubifs_clean_zn_cnt); + + zbr->znode = znode; + znode->parent = parent; + znode->time = get_seconds(); + znode->iip = iip; + + return znode; + +out: + kfree(znode); + return ERR_PTR(err); +} + +/** + * ubifs_tnc_read_node - read a leaf node from the flash media. + * @c: UBIFS file-system description object + * @zbr: key and position of the node + * @node: node is returned here + * + * This function reads a node defined by @zbr from the flash media. Returns + * zero in case of success or a negative error code in case of + * failure. + */ +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) +{ + union ubifs_key key1, *key = &zbr->key; + int err, type = key_type(c, key); + struct ubifs_wbuf *wbuf; + + /* + * 'zbr' has to point to an on-flash node. The node may sit in a bud and + * may even be in a write buffer, so we have to take care of this. 
+ */ + wbuf = ubifs_get_wbuf(c, zbr->lnum); + if (wbuf) + err = ubifs_read_node_wbuf(wbuf, node, type, zbr->len, + zbr->lnum, zbr->offs); + else + err = ubifs_read_node(c, node, type, zbr->len, zbr->lnum, + zbr->offs); + + if (err) { + dbg_tnc("key %s", DBGKEY(key)); + return err; + } + + /* Make sure the key of the read node is correct */ + key_read(c, node + UBIFS_KEY_OFFSET, &key1); + if (!keys_eq(c, key, &key1)) { + ubifs_err("bad key in node at LEB %d:%d", + zbr->lnum, zbr->offs); + dbg_tnc("looked for key %s found node's key %s", + DBGKEY(key), DBGKEY1(&key1)); + dbg_dump_node(c, node); + return -EINVAL; + } + + return 0; +} diff -Nurd linux-2.6.24/fs/ubifs/ubifs-media.h ubifs-v2.6.24/fs/ubifs/ubifs-media.h --- linux-2.6.24/fs/ubifs/ubifs-media.h 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/ubifs-media.h 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,775 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file describes UBIFS on-flash format and contains definitions of all the + * relevant data structures and constants. + * + * All UBIFS on-flash objects are stored in the form of nodes. 
All nodes start + * with the UBIFS node magic number and have the same common header. Nodes + * always sit at 8-byte aligned positions on the media and node header sizes are + * also 8-byte aligned (except for the indexing node and the padding node). + */ + +#ifndef __UBIFS_MEDIA_H__ +#define __UBIFS_MEDIA_H__ + +/* UBIFS node magic number (must not have the padding byte first or last) */ +#define UBIFS_NODE_MAGIC 0x06101831 + +/* + * UBIFS on-flash format version. This version is increased when the on-flash + * format changes. If this happens, UBIFS will support older versions as + * well. But older UBIFS code will not support newer formats. Format changes + * will be rare and only when absolutely necessary, e.g. to fix a bug or to add + * a new feature. + * + * UBIFS went into mainline kernel with format version 4. The older formats + * were development formats. + */ +#define UBIFS_FORMAT_VERSION 4 + +/* + * Read-only compatibility version. If the UBIFS format is changed, older UBIFS + * implementations will not be able to mount newer formats in read-write mode. + * However, depending on the change, it may be possible to mount newer formats + * in R/O mode. This is indicated by the R/O compatibility version which is + * stored in the super-block. + * + * This is needed to support boot-loaders which only need R/O mounting. With + * this flag it is possible to do UBIFS format changes without a need to update + * boot-loaders. + */ +#define UBIFS_RO_COMPAT_VERSION 0 + +/* Minimum logical eraseblock size in bytes */ +#define UBIFS_MIN_LEB_SZ (15*1024) + +/* Initial CRC32 value used when calculating CRC checksums */ +#define UBIFS_CRC32_INIT 0xFFFFFFFFU + +/* + * UBIFS does not try to compress data if its length is less than the below + * constant. 
+ */ +#define UBIFS_MIN_COMPR_LEN 128 + +/* + * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes + * shorter than uncompressed data length, UBIFS prefers to leave this data + * node uncompressed, because it'll be read faster. + */ +#define UBIFS_MIN_COMPRESS_DIFF 64 + +/* Root inode number */ +#define UBIFS_ROOT_INO 1 + +/* Lowest inode number used for regular inodes (not UBIFS-only internal ones) */ +#define UBIFS_FIRST_INO 64 + +/* + * Maximum file name and extended attribute length (must be a multiple of 8, + * minus 1). + */ +#define UBIFS_MAX_NLEN 255 + +/* Maximum number of data journal heads */ +#define UBIFS_MAX_JHEADS 1 + +/* + * Size of UBIFS data block. Note, UBIFS is not a block oriented file-system, + * which means that it does not treat the underlying media as consisting of + * blocks like in case of hard drives. Do not be confused. A UBIFS block is just + * the maximum amount of data which one data node can have or which can be + * attached to an inode node. 
+ */ +#define UBIFS_BLOCK_SIZE 4096 +#define UBIFS_BLOCK_SHIFT 12 + +/* UBIFS padding byte pattern (must not be first or last byte of node magic) */ +#define UBIFS_PADDING_BYTE 0xCE + +/* Maximum possible key length */ +#define UBIFS_MAX_KEY_LEN 16 + +/* Key length ("simple" format) */ +#define UBIFS_SK_LEN 8 + +/* Minimum index tree fanout */ +#define UBIFS_MIN_FANOUT 3 + +/* Maximum number of levels in UBIFS indexing B-tree */ +#define UBIFS_MAX_LEVELS 512 + +/* Maximum amount of data attached to an inode in bytes */ +#define UBIFS_MAX_INO_DATA UBIFS_BLOCK_SIZE + +/* LEB Properties Tree fanout (must be power of 2) and fanout shift */ +#define UBIFS_LPT_FANOUT 4 +#define UBIFS_LPT_FANOUT_SHIFT 2 + +/* LEB Properties Tree bit field sizes */ +#define UBIFS_LPT_CRC_BITS 16 +#define UBIFS_LPT_CRC_BYTES 2 +#define UBIFS_LPT_TYPE_BITS 4 + +/* The key is always at the same position in all keyed nodes */ +#define UBIFS_KEY_OFFSET offsetof(struct ubifs_ino_node, key) + +/* + * LEB Properties Tree node types. + * + * UBIFS_LPT_PNODE: LPT leaf node (contains LEB properties) + * UBIFS_LPT_NNODE: LPT internal node + * UBIFS_LPT_LTAB: LPT's own lprops table + * UBIFS_LPT_LSAVE: LPT's save table (big model only) + * UBIFS_LPT_NODE_CNT: count of LPT node types + * UBIFS_LPT_NOT_A_NODE: all ones (15 for 4 bits) is never a valid node type + */ +enum { + UBIFS_LPT_PNODE, + UBIFS_LPT_NNODE, + UBIFS_LPT_LTAB, + UBIFS_LPT_LSAVE, + UBIFS_LPT_NODE_CNT, + UBIFS_LPT_NOT_A_NODE = (1 << UBIFS_LPT_TYPE_BITS) - 1, +}; + +/* + * UBIFS inode types. 
+ * + * UBIFS_ITYPE_REG: regular file + * UBIFS_ITYPE_DIR: directory + * UBIFS_ITYPE_LNK: soft link + * UBIFS_ITYPE_BLK: block device node + * UBIFS_ITYPE_CHR: character device node + * UBIFS_ITYPE_FIFO: fifo + * UBIFS_ITYPE_SOCK: socket + * UBIFS_ITYPES_CNT: count of supported file types + */ +enum { + UBIFS_ITYPE_REG, + UBIFS_ITYPE_DIR, + UBIFS_ITYPE_LNK, + UBIFS_ITYPE_BLK, + UBIFS_ITYPE_CHR, + UBIFS_ITYPE_FIFO, + UBIFS_ITYPE_SOCK, + UBIFS_ITYPES_CNT, +}; + +/* + * Supported key hash functions. + * + * UBIFS_KEY_HASH_R5: R5 hash + * UBIFS_KEY_HASH_TEST: test hash which just returns first 4 bytes of the name + */ +enum { + UBIFS_KEY_HASH_R5, + UBIFS_KEY_HASH_TEST, +}; + +/* + * Supported key formats. + * + * UBIFS_SIMPLE_KEY_FMT: simple key format + */ +enum { + UBIFS_SIMPLE_KEY_FMT, +}; + +/* + * The simple key format uses 29 bits for storing UBIFS block number and hash + * value. + */ +#define UBIFS_S_KEY_BLOCK_BITS 29 +#define UBIFS_S_KEY_BLOCK_MASK 0x1FFFFFFF +#define UBIFS_S_KEY_HASH_BITS UBIFS_S_KEY_BLOCK_BITS +#define UBIFS_S_KEY_HASH_MASK UBIFS_S_KEY_BLOCK_MASK + +/* + * Key types. + * + * UBIFS_INO_KEY: inode node key + * UBIFS_DATA_KEY: data node key + * UBIFS_DENT_KEY: directory entry node key + * UBIFS_XENT_KEY: extended attribute entry key + * UBIFS_KEY_TYPES_CNT: number of supported key types + */ +enum { + UBIFS_INO_KEY, + UBIFS_DATA_KEY, + UBIFS_DENT_KEY, + UBIFS_XENT_KEY, + UBIFS_KEY_TYPES_CNT, +}; + +/* Count of LEBs reserved for the superblock area */ +#define UBIFS_SB_LEBS 1 +/* Count of LEBs reserved for the master area */ +#define UBIFS_MST_LEBS 2 + +/* First LEB of the superblock area */ +#define UBIFS_SB_LNUM 0 +/* First LEB of the master area */ +#define UBIFS_MST_LNUM (UBIFS_SB_LNUM + UBIFS_SB_LEBS) +/* First LEB of the log area */ +#define UBIFS_LOG_LNUM (UBIFS_MST_LNUM + UBIFS_MST_LEBS) + +/* + * The below constants define the absolute minimum values for various UBIFS + * media areas. 
Many of them actually depend on flash geometry and the FS + * configuration (number of journal heads, orphan LEBs, etc). This means that + * the smallest volume size which can be used for UBIFS cannot be pre-defined + * by these constants. The file-system that meets the below limitation will not + * necessarily mount. UBIFS does run-time calculations and validates the FS + * size. + */ + +/* Minimum number of logical eraseblocks in the log */ +#define UBIFS_MIN_LOG_LEBS 2 +/* Minimum number of bud logical eraseblocks (one for each head) */ +#define UBIFS_MIN_BUD_LEBS 3 +/* Minimum number of journal logical eraseblocks */ +#define UBIFS_MIN_JNL_LEBS (UBIFS_MIN_LOG_LEBS + UBIFS_MIN_BUD_LEBS) +/* Minimum number of LPT area logical eraseblocks */ +#define UBIFS_MIN_LPT_LEBS 2 +/* Minimum number of orphan area logical eraseblocks */ +#define UBIFS_MIN_ORPH_LEBS 1 +/* + * Minimum number of main area logical eraseblocks (buds, 3 for the index, 1 + * for GC, 1 for deletions, and at least 1 for committed data). + */ +#define UBIFS_MIN_MAIN_LEBS (UBIFS_MIN_BUD_LEBS + 6) + +/* Minimum number of logical eraseblocks */ +#define UBIFS_MIN_LEB_CNT (UBIFS_SB_LEBS + UBIFS_MST_LEBS + \ + UBIFS_MIN_LOG_LEBS + UBIFS_MIN_LPT_LEBS + \ + UBIFS_MIN_ORPH_LEBS + UBIFS_MIN_MAIN_LEBS) + +/* Node sizes (N.B. 
these are guaranteed to be multiples of 8) */ +#define UBIFS_CH_SZ sizeof(struct ubifs_ch) +#define UBIFS_INO_NODE_SZ sizeof(struct ubifs_ino_node) +#define UBIFS_DATA_NODE_SZ sizeof(struct ubifs_data_node) +#define UBIFS_DENT_NODE_SZ sizeof(struct ubifs_dent_node) +#define UBIFS_TRUN_NODE_SZ sizeof(struct ubifs_trun_node) +#define UBIFS_PAD_NODE_SZ sizeof(struct ubifs_pad_node) +#define UBIFS_SB_NODE_SZ sizeof(struct ubifs_sb_node) +#define UBIFS_MST_NODE_SZ sizeof(struct ubifs_mst_node) +#define UBIFS_REF_NODE_SZ sizeof(struct ubifs_ref_node) +#define UBIFS_IDX_NODE_SZ sizeof(struct ubifs_idx_node) +#define UBIFS_CS_NODE_SZ sizeof(struct ubifs_cs_node) +#define UBIFS_ORPH_NODE_SZ sizeof(struct ubifs_orph_node) +/* Extended attribute entry nodes are identical to directory entry nodes */ +#define UBIFS_XENT_NODE_SZ UBIFS_DENT_NODE_SZ +/* Only this does not have to be multiple of 8 bytes */ +#define UBIFS_BRANCH_SZ sizeof(struct ubifs_branch) + +/* Maximum node sizes (N.B. these are guaranteed to be multiples of 8) */ +#define UBIFS_MAX_DATA_NODE_SZ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) +#define UBIFS_MAX_INO_NODE_SZ (UBIFS_INO_NODE_SZ + UBIFS_MAX_INO_DATA) +#define UBIFS_MAX_DENT_NODE_SZ (UBIFS_DENT_NODE_SZ + UBIFS_MAX_NLEN + 1) +#define UBIFS_MAX_XENT_NODE_SZ UBIFS_MAX_DENT_NODE_SZ + +/* The largest UBIFS node */ +#define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ + +/* + * On-flash inode flags. + * + * UBIFS_COMPR_FL: use compression for this inode + * UBIFS_SYNC_FL: I/O on this inode has to be synchronous + * UBIFS_IMMUTABLE_FL: inode is immutable + * UBIFS_APPEND_FL: writes to the inode may only append data + * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous + * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * + * Note, these are on-flash flags which correspond to ioctl flags + * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not + * have to be the same. 
+ */ +enum { + UBIFS_COMPR_FL = 0x01, + UBIFS_SYNC_FL = 0x02, + UBIFS_IMMUTABLE_FL = 0x04, + UBIFS_APPEND_FL = 0x08, + UBIFS_DIRSYNC_FL = 0x10, + UBIFS_XATTR_FL = 0x20, +}; + +/* Inode flag bits used by UBIFS */ +#define UBIFS_FL_MASK 0x0000001F + +/* + * UBIFS compression types. + * + * UBIFS_COMPR_NONE: no compression + * UBIFS_COMPR_LZO: LZO compression + * UBIFS_COMPR_ZLIB: ZLIB compression + * UBIFS_COMPR_TYPES_CNT: count of supported compression types + */ +enum { + UBIFS_COMPR_NONE, + UBIFS_COMPR_LZO, + UBIFS_COMPR_ZLIB, + UBIFS_COMPR_TYPES_CNT, +}; + +/* + * UBIFS node types. + * + * UBIFS_INO_NODE: inode node + * UBIFS_DATA_NODE: data node + * UBIFS_DENT_NODE: directory entry node + * UBIFS_XENT_NODE: extended attribute node + * UBIFS_TRUN_NODE: truncation node + * UBIFS_PAD_NODE: padding node + * UBIFS_SB_NODE: superblock node + * UBIFS_MST_NODE: master node + * UBIFS_REF_NODE: LEB reference node + * UBIFS_IDX_NODE: index node + * UBIFS_CS_NODE: commit start node + * UBIFS_ORPH_NODE: orphan node + * UBIFS_NODE_TYPES_CNT: count of supported node types + * + * Note, we index arrays by these numbers, so keep them low and contiguous. + * Node type constants for inodes, direntries and so on have to be the same as + * corresponding key type constants. + */ +enum { + UBIFS_INO_NODE, + UBIFS_DATA_NODE, + UBIFS_DENT_NODE, + UBIFS_XENT_NODE, + UBIFS_TRUN_NODE, + UBIFS_PAD_NODE, + UBIFS_SB_NODE, + UBIFS_MST_NODE, + UBIFS_REF_NODE, + UBIFS_IDX_NODE, + UBIFS_CS_NODE, + UBIFS_ORPH_NODE, + UBIFS_NODE_TYPES_CNT, +}; + +/* + * Master node flags. + * + * UBIFS_MST_DIRTY: rebooted uncleanly - master node is dirty + * UBIFS_MST_NO_ORPHS: no orphan inodes present + * UBIFS_MST_RCVRY: written by recovery + */ +enum { + UBIFS_MST_DIRTY = 1, + UBIFS_MST_NO_ORPHS = 2, + UBIFS_MST_RCVRY = 4, +}; + +/* + * Node group type (used by recovery to recover whole group or none). 
+ * + * UBIFS_NO_NODE_GROUP: this node is not part of a group + * UBIFS_IN_NODE_GROUP: this node is a part of a group + * UBIFS_LAST_OF_NODE_GROUP: this node is the last in a group + */ +enum { + UBIFS_NO_NODE_GROUP = 0, + UBIFS_IN_NODE_GROUP, + UBIFS_LAST_OF_NODE_GROUP, +}; + +/* + * Superblock flags. + * + * UBIFS_FLG_BIGLPT: "big" LPT model is used if set + */ +enum { + UBIFS_FLG_BIGLPT = 0x02, +}; + +/** + * struct ubifs_ch - common header node. + * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) + * @crc: CRC-32 checksum of the node header + * @sqnum: sequence number + * @len: full node length + * @node_type: node type + * @group_type: node group type + * @padding: reserved for future, zeroes + * + * Every UBIFS node starts with this common part. If the node has a key, the + * key always goes next. + */ +struct ubifs_ch { + __le32 magic; + __le32 crc; + __le64 sqnum; + __le32 len; + __u8 node_type; + __u8 group_type; + __u8 padding[2]; +} __attribute__ ((packed)); + +/** + * union ubifs_dev_desc - device node descriptor. + * @new: new type device descriptor + * @huge: huge type device descriptor + * + * This data structure describes major/minor numbers of a device node. If an + * inode is a device node then its data contains an object of this type. UBIFS + * uses standard Linux "new" and "huge" device node encodings. + */ +union ubifs_dev_desc { + __le32 new; + __le64 huge; +} __attribute__ ((packed)); + +/** + * struct ubifs_ino_node - inode node. 
+ * @ch: common header + * @key: node key + * @creat_sqnum: sequence number at time of creation + * @size: inode size in bytes (amount of uncompressed data) + * @atime_sec: access time seconds + * @ctime_sec: creation time seconds + * @mtime_sec: modification time seconds + * @atime_nsec: access time nanoseconds + * @ctime_nsec: creation time nanoseconds + * @mtime_nsec: modification time nanoseconds + * @nlink: number of hard links + * @uid: owner ID + * @gid: group ID + * @mode: access flags + * @flags: per-inode flags (%UBIFS_COMPR_FL, %UBIFS_SYNC_FL, etc) + * @data_len: inode data length + * @xattr_cnt: count of extended attributes this inode has + * @xattr_size: summarized size of all extended attributes in bytes + * @padding1: reserved for future, zeroes + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @compr_type: compression type used for this inode + * @padding2: reserved for future, zeroes + * @data: data attached to the inode + * + * Note, even though inode compression type is defined by @compr_type, some + * nodes of this inode may be compressed with a different compressor - this + * happens if compression type is changed while the inode already has data + * nodes. But @compr_type will be used for further writes to the inode. + * + * Note, do not forget to amend 'zero_ino_node_unused()' function when changing + * the padding fields. + */ +struct ubifs_ino_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 creat_sqnum; + __le64 size; + __le64 atime_sec; + __le64 ctime_sec; + __le64 mtime_sec; + __le32 atime_nsec; + __le32 ctime_nsec; + __le32 mtime_nsec; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le32 flags; + __le32 data_len; + __le32 xattr_cnt; + __le32 xattr_size; + __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */ + __le32 xattr_names; + __le16 compr_type; + __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! 
*/ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_dent_node - directory entry node. + * @ch: common header + * @key: node key + * @inum: target inode number + * @padding1: reserved for future, zeroes + * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) + * @nlen: name length + * @padding2: reserved for future, zeroes + * @name: zero-terminated name + * + * Note, do not forget to amend 'zero_dent_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_dent_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le64 inum; + __u8 padding1; + __u8 type; + __le16 nlen; + __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! */ + __u8 name[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_data_node - data node. + * @ch: common header + * @key: node key + * @size: uncompressed data size in bytes + * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) + * @padding: reserved for future, zeroes + * @data: data + * + * Note, do not forget to amend 'zero_data_node_unused()' function when + * changing the padding fields. + */ +struct ubifs_data_node { + struct ubifs_ch ch; + __u8 key[UBIFS_MAX_KEY_LEN]; + __le32 size; + __le16 compr_type; + __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __u8 data[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_trun_node - truncation node. + * @ch: common header + * @inum: truncated inode number + * @padding: reserved for future, zeroes + * @old_size: size before truncation + * @new_size: size after truncation + * + * This node exists only in the journal and never goes to the main area. Note, + * do not forget to amend 'zero_trun_node_unused()' function when changing the + * padding fields. + */ +struct ubifs_trun_node { + struct ubifs_ch ch; + __le32 inum; + __u8 padding[12]; /* Watch 'zero_trun_node_unused()' if changing! 
*/ + __le64 old_size; + __le64 new_size; +} __attribute__ ((packed)); + +/** + * struct ubifs_pad_node - padding node. + * @ch: common header + * @pad_len: how many bytes after this node are unused (because padded) + */ +struct ubifs_pad_node { + struct ubifs_ch ch; + __le32 pad_len; +} __attribute__ ((packed)); + +/** + * struct ubifs_sb_node - superblock node. + * @ch: common header + * @padding: reserved for future, zeroes + * @key_hash: type of hash function used in keys + * @key_fmt: format of the key + * @flags: file-system flags (%UBIFS_FLG_BIGLPT, etc) + * @min_io_size: minimal input/output unit size + * @leb_size: logical eraseblock size in bytes + * @leb_cnt: count of LEBs used by file-system + * @max_leb_cnt: maximum count of LEBs used by file-system + * @max_bud_bytes: maximum amount of data stored in buds + * @log_lebs: log size in logical eraseblocks + * @lpt_lebs: number of LEBs used for lprops table + * @orph_lebs: number of LEBs used for recording orphans + * @jhead_cnt: count of journal heads + * @fanout: tree fanout (max. 
number of links per indexing node) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @fmt_version: UBIFS on-flash format version + * @default_compr: default compression + * @padding1: reserved for future, zeroes + * @rp_uid: reserve pool UID + * @rp_gid: reserve pool GID + * @rp_size: size of the reserved pool in bytes + * @padding2: reserved for future, zeroes + * @time_gran: time granularity in nanoseconds + * @uuid: UUID generated when the file system image was created + * @ro_compat_version: UBIFS R/O compatibility version + */ +struct ubifs_sb_node { + struct ubifs_ch ch; + __u8 padding[2]; + __u8 key_hash; + __u8 key_fmt; + __le32 flags; + __le32 min_io_size; + __le32 leb_size; + __le32 leb_cnt; + __le32 max_leb_cnt; + __le64 max_bud_bytes; + __le32 log_lebs; + __le32 lpt_lebs; + __le32 orph_lebs; + __le32 jhead_cnt; + __le32 fanout; + __le32 lsave_cnt; + __le32 fmt_version; + __le16 default_compr; + __u8 padding1[2]; + __le32 rp_uid; + __le32 rp_gid; + __le64 rp_size; + __le32 time_gran; + __u8 uuid[16]; + __le32 ro_compat_version; + __u8 padding2[3968]; +} __attribute__ ((packed)); + +/** + * struct ubifs_mst_node - master node. 
+ * @ch: common header + * @highest_inum: highest inode number in the committed index + * @cmt_no: commit number + * @flags: various flags (%UBIFS_MST_DIRTY, etc) + * @log_lnum: start of the log + * @root_lnum: LEB number of the root indexing node + * @root_offs: offset within @root_lnum + * @root_len: root indexing node length + * @gc_lnum: LEB reserved for garbage collection (%-1 value means the LEB was + * not reserved and should be reserved on mount) + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @index_size: size of index on flash + * @total_free: total free space in bytes + * @total_dirty: total dirty space in bytes + * @total_used: total used space in bytes (includes only data LEBs) + * @total_dead: total dead space in bytes (includes only data LEBs) + * @total_dark: total dark space in bytes (includes only data LEBs) + * @lpt_lnum: LEB number of LPT root nnode + * @lpt_offs: offset of LPT root nnode + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @lsave_lnum: LEB number of LPT's save table (big model only) + * @lsave_offs: offset of LPT's save table (big model only) + * @lscan_lnum: LEB number of last LPT scan + * @empty_lebs: number of empty logical eraseblocks + * @idx_lebs: number of indexing logical eraseblocks + * @leb_cnt: count of LEBs used by file-system + * @padding: reserved for future, zeroes + */ +struct ubifs_mst_node { + struct ubifs_ch ch; + __le64 highest_inum; + __le64 cmt_no; + __le32 flags; + __le32 log_lnum; + __le32 root_lnum; + __le32 root_offs; + __le32 root_len; + __le32 gc_lnum; + __le32 ihead_lnum; + __le32 ihead_offs; + __le64 index_size; + __le64 total_free; + __le64 total_dirty; + __le64 total_used; + __le64 total_dead; + __le64 total_dark; + __le32 lpt_lnum; + __le32 lpt_offs; + __le32 nhead_lnum; + __le32 nhead_offs; + __le32 ltab_lnum; + __le32 ltab_offs; + 
__le32 lsave_lnum; + __le32 lsave_offs; + __le32 lscan_lnum; + __le32 empty_lebs; + __le32 idx_lebs; + __le32 leb_cnt; + __u8 padding[344]; +} __attribute__ ((packed)); + +/** + * struct ubifs_ref_node - logical eraseblock reference node. + * @ch: common header + * @lnum: the referred logical eraseblock number + * @offs: start offset in the referred LEB + * @jhead: journal head number + * @padding: reserved for future, zeroes + */ +struct ubifs_ref_node { + struct ubifs_ch ch; + __le32 lnum; + __le32 offs; + __le32 jhead; + __u8 padding[28]; +} __attribute__ ((packed)); + +/** + * struct ubifs_branch - key/reference/length branch + * @lnum: LEB number of the target node + * @offs: offset within @lnum + * @len: target node length + * @key: key + */ +struct ubifs_branch { + __le32 lnum; + __le32 offs; + __le32 len; + __u8 key[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_idx_node - indexing node. + * @ch: common header + * @child_cnt: number of child index nodes + * @level: tree level + * @branches: LEB number / offset / length / key branches + */ +struct ubifs_idx_node { + struct ubifs_ch ch; + __le16 child_cnt; + __le16 level; + __u8 branches[]; +} __attribute__ ((packed)); + +/** + * struct ubifs_cs_node - commit start node. + * @ch: common header + * @cmt_no: commit number + */ +struct ubifs_cs_node { + struct ubifs_ch ch; + __le64 cmt_no; +} __attribute__ ((packed)); + +/** + * struct ubifs_orph_node - orphan node. 
+ * @ch: common header + * @cmt_no: commit number (also top bit is set on the last node of the commit) + * @inos: inode numbers of orphans + */ +struct ubifs_orph_node { + struct ubifs_ch ch; + __le64 cmt_no; + __le64 inos[]; +} __attribute__ ((packed)); + +#endif /* __UBIFS_MEDIA_H__ */ diff -Nurd linux-2.6.24/fs/ubifs/ubifs.h ubifs-v2.6.24/fs/ubifs/ubifs.h --- linux-2.6.24/fs/ubifs/ubifs.h 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/ubifs.h 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,1729 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +#ifndef __UBIFS_H__ +#define __UBIFS_H__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ubifs-media.h" + +/* Version of this UBIFS implementation */ +#define UBIFS_VERSION 1 + +/* Normal UBIFS messages */ +#define ubifs_msg(fmt, ...) \ + printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__) +/* UBIFS error messages */ +#define ubifs_err(fmt, ...) \ + printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ + __func__, ##__VA_ARGS__) +/* UBIFS warning messages */ +#define ubifs_warn(fmt, ...) 
\ + printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \ + current->pid, __func__, ##__VA_ARGS__) + +/* UBIFS file system VFS magic number */ +#define UBIFS_SUPER_MAGIC 0x24051905 + +/* Number of UBIFS blocks per VFS page */ +#define UBIFS_BLOCKS_PER_PAGE (PAGE_CACHE_SIZE / UBIFS_BLOCK_SIZE) +#define UBIFS_BLOCKS_PER_PAGE_SHIFT (PAGE_CACHE_SHIFT - UBIFS_BLOCK_SHIFT) + +/* "File system end of life" sequence number watermark */ +#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL +#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL + +/* + * Minimum amount of LEBs reserved for the index. At present the index needs at + * least 2 LEBs: one for the index head and one for in-the-gaps method (which + * currently does not cater for the index head and so excludes it from + * consideration). + */ +#define MIN_INDEX_LEBS 2 + +/* Minimum amount of data UBIFS writes to the flash */ +#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8) + +/* + * Currently we do not support inode number overlapping and re-using, so this + * watermark defines dangerous inode number level. This should be fixed later, + * although it is difficult to exceed current limit. Another option is to use + * 64-bit inode numbers, but this means more overhead. + */ +#define INUM_WARN_WATERMARK 0xFFF00000 +#define INUM_WATERMARK 0xFFFFFF00 + +/* Largest key size supported in this implementation */ +#define CUR_MAX_KEY_LEN UBIFS_SK_LEN + +/* Maximum number of entries in each LPT (LEB category) heap */ +#define LPT_HEAP_SZ 256 + +/* + * Background thread name pattern. The numbers are UBI device and volume + * numbers. 
+ */ +#define BGT_NAME_PATTERN "ubifs_bgt%d_%d" + +/* Default write-buffer synchronization timeout (5 secs) */ +#define DEFAULT_WBUF_TIMEOUT (5 * HZ) + +/* Maximum possible inode number (only 32-bit inodes are supported now) */ +#define MAX_INUM 0xFFFFFFFF + +/* Number of non-data journal heads */ +#define NONDATA_JHEADS_CNT 2 + +/* Garbage collector head */ +#define GCHD 0 +/* Base journal head number */ +#define BASEHD 1 +/* First "general purpose" journal head */ +#define DATAHD 2 + +/* 'No change' value for 'ubifs_change_lp()' */ +#define LPROPS_NC 0x80000001 + +/* + * There is no notion of truncation key because truncation nodes do not exist + * in TNC. However, when replaying, it is handy to introduce fake "truncation" + * keys for truncation nodes because the code becomes simpler. So we define + * %UBIFS_TRUN_KEY type. + */ +#define UBIFS_TRUN_KEY UBIFS_KEY_TYPES_CNT + +/* + * How much a directory entry/extended attribute entry adds to the parent/host + * inode. + */ +#define CALC_DENT_SIZE(name_len) ALIGN(UBIFS_DENT_NODE_SZ + (name_len) + 1, 8) + +/* How much an extended attribute adds to the host inode */ +#define CALC_XATTR_BYTES(data_len) ALIGN(UBIFS_INO_NODE_SZ + (data_len) + 1, 8) + +/* + * Znodes which were not touched for 'OLD_ZNODE_AGE' seconds are considered + * "old", and znodes which were touched last 'YOUNG_ZNODE_AGE' seconds ago are + * considered "young". This is used by shrinker when selecting znodes to trim + * off. + */ +#define OLD_ZNODE_AGE 20 +#define YOUNG_ZNODE_AGE 5 + +/* + * Some compressors, like LZO, may end up with more data than the input buffer. + * So UBIFS always allocates larger output buffer, to be sure the compressor + * will not corrupt memory in case of worst case compression. 
+ */ +#define WORST_COMPR_FACTOR 2 + +/* Maximum expected tree height for use by bottom_up_buf */ +#define BOTTOM_UP_HEIGHT 64 + +/* Maximum number of data nodes to bulk-read */ +#define UBIFS_MAX_BULK_READ 32 + +/* + * Lockdep classes for UBIFS inode @ui_mutex. + */ +enum { + WB_MUTEX_1 = 0, + WB_MUTEX_2 = 1, + WB_MUTEX_3 = 2, +}; + +/* + * Znode flags (actually, bit numbers which store the flags). + * + * DIRTY_ZNODE: znode is dirty + * COW_ZNODE: znode is being committed and a new instance of this znode has to + * be created before changing this znode + * OBSOLETE_ZNODE: znode is obsolete, which means it was deleted, but it is + * still in the commit list and the ongoing commit operation + * will commit it, and delete this znode after it is done + */ +enum { + DIRTY_ZNODE = 0, + COW_ZNODE = 1, + OBSOLETE_ZNODE = 2, +}; + +/* + * Commit states. + * + * COMMIT_RESTING: commit is not wanted + * COMMIT_BACKGROUND: background commit has been requested + * COMMIT_REQUIRED: commit is required + * COMMIT_RUNNING_BACKGROUND: background commit is running + * COMMIT_RUNNING_REQUIRED: commit is running and it is required + * COMMIT_BROKEN: commit failed + */ +enum { + COMMIT_RESTING = 0, + COMMIT_BACKGROUND, + COMMIT_REQUIRED, + COMMIT_RUNNING_BACKGROUND, + COMMIT_RUNNING_REQUIRED, + COMMIT_BROKEN, +}; + +/* + * 'ubifs_scan_a_node()' return values. + * + * SCANNED_GARBAGE: scanned garbage + * SCANNED_EMPTY_SPACE: scanned empty space + * SCANNED_A_NODE: scanned a valid node + * SCANNED_A_CORRUPT_NODE: scanned a corrupted node + * SCANNED_A_BAD_PAD_NODE: scanned a padding node with invalid pad length + * + * Greater than zero means: 'scanned that number of padding bytes' + */ +enum { + SCANNED_GARBAGE = 0, + SCANNED_EMPTY_SPACE = -1, + SCANNED_A_NODE = -2, + SCANNED_A_CORRUPT_NODE = -3, + SCANNED_A_BAD_PAD_NODE = -4, +}; + +/* + * LPT cnode flag bits. 
+ * + * DIRTY_CNODE: cnode is dirty + * COW_CNODE: cnode is being committed and must be copied before writing + * OBSOLETE_CNODE: cnode is being committed and has been copied (or deleted), + * so it can (and must) be freed when the commit is finished + */ +enum { + DIRTY_CNODE = 0, + COW_CNODE = 1, + OBSOLETE_CNODE = 2, +}; + +/* + * Dirty flag bits (lpt_drty_flgs) for LPT special nodes. + * + * LTAB_DIRTY: ltab node is dirty + * LSAVE_DIRTY: lsave node is dirty + */ +enum { + LTAB_DIRTY = 1, + LSAVE_DIRTY = 2, +}; + +/* + * Return codes used by the garbage collector. + * @LEB_FREED: the logical eraseblock was freed and is ready to use + * @LEB_FREED_IDX: indexing LEB was freed and can be used only after the commit + * @LEB_RETAINED: the logical eraseblock was freed and retained for GC purposes + */ +enum { + LEB_FREED, + LEB_FREED_IDX, + LEB_RETAINED, +}; + +/** + * struct ubifs_old_idx - index node obsoleted since last commit start. + * @rb: rb-tree node + * @lnum: LEB number of obsoleted index node + * @offs: offset of obsoleted index node + */ +struct ubifs_old_idx { + struct rb_node rb; + int lnum; + int offs; +}; + +/* The below union makes it easier to deal with keys */ +union ubifs_key { + uint8_t u8[CUR_MAX_KEY_LEN]; + uint32_t u32[CUR_MAX_KEY_LEN/4]; + uint64_t u64[CUR_MAX_KEY_LEN/8]; + __le32 j32[CUR_MAX_KEY_LEN/4]; +}; + +/** + * struct ubifs_scan_node - UBIFS scanned node information. + * @list: list of scanned nodes + * @key: key of node scanned (if it has one) + * @sqnum: sequence number + * @type: type of node scanned + * @offs: offset within LEB of node scanned + * @len: length of node scanned + * @node: raw node + */ +struct ubifs_scan_node { + struct list_head list; + union ubifs_key key; + unsigned long long sqnum; + int type; + int offs; + int len; + void *node; +}; + +/** + * struct ubifs_scan_leb - UBIFS scanned LEB information. 
+ * @lnum: logical eraseblock number + * @nodes_cnt: number of nodes scanned + * @nodes: list of struct ubifs_scan_node + * @endpt: end point (and therefore the start of empty space) + * @ecc: read returned -EBADMSG + * @buf: buffer containing entire LEB scanned + */ +struct ubifs_scan_leb { + int lnum; + int nodes_cnt; + struct list_head nodes; + int endpt; + int ecc; + void *buf; +}; + +/** + * struct ubifs_gced_idx_leb - garbage-collected indexing LEB. + * @list: list + * @lnum: LEB number + * @unmap: OK to unmap this LEB + * + * This data structure is used to temporarily store garbage-collected indexing + * LEBs - they are not released immediately, but only after the next commit. + * This is needed to guarantee recoverability. + */ +struct ubifs_gced_idx_leb { + struct list_head list; + int lnum; + int unmap; +}; + +/** + * struct ubifs_inode - UBIFS in-memory inode description. + * @vfs_inode: VFS inode description object + * @creat_sqnum: sequence number at time of creation + * @del_cmtno: commit number corresponding to the time the inode was deleted, + * protected by @c->commit_sem + * @xattr_size: summarized size of all extended attributes in bytes + * @xattr_cnt: count of extended attributes this inode has + * @xattr_names: sum of lengths of all extended attribute names belonging to + * this inode + * @dirty: non-zero if the inode is dirty + * @xattr: non-zero if this is an extended attribute inode + * @bulk_read: non-zero if bulk-read should be used + * @ui_mutex: serializes inode write-back with the rest of VFS operations, + * serializes "clean <-> dirty" state changes, serializes bulk-read, + * protects @dirty, @bulk_read, @ui_size, and @xattr_size + * @ui_lock: protects @synced_i_size + * @synced_i_size: synchronized size of inode, i.e. 
the value of inode size + * currently stored on the flash; used only for regular file + * inodes + * @ui_size: inode size used by UBIFS when writing to flash + * @flags: inode flags (%UBIFS_COMPR_FL, etc) + * @compr_type: default compression type used for this inode + * @last_page_read: page number of last page read (for bulk read) + * @read_in_a_row: number of consecutive pages read in a row (for bulk read) + * @data_len: length of the data attached to the inode + * @data: inode's data + * + * @ui_mutex exists for two main reasons. First, it prevents inodes from + * being written back while UBIFS is changing them in the middle of a VFS + * operation. This way UBIFS makes sure the inode fields are consistent. For + * example, in 'ubifs_rename()' we change 3 inodes simultaneously, and + * write-back must not write any of them before we have finished. + * + * The second reason is budgeting - UBIFS has to budget all operations. If an + * operation is going to mark an inode dirty, it has to allocate budget for + * this. It cannot just mark it dirty because there is no guarantee there will + * be enough flash space to write the inode back later. This means UBIFS has + * to have full control over inode "clean <-> dirty" transitions (and pages + * actually). But unfortunately, VFS marks inodes dirty in many places, and it + * does not ask the file-system if it is allowed to do so (there is a notifier, + * but it is not enough), i.e., there is no mechanism to synchronize with this. + * So UBIFS has its own inode dirty flag and its own mutex to serialize + * "clean <-> dirty" transitions. + * + * The @synced_i_size field is used to make sure we never write pages which are + * beyond last synchronized inode size. See 'ubifs_writepage()' for more + * information. + * + * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses + * @ui_size instead of @inode->i_size. 
The reason for this is that UBIFS cannot + * make sure @inode->i_size is always changed under @ui_mutex, because it + * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock + * with 'ubifs_writepage()' (see file.c). All the other inode fields are + * changed under @ui_mutex, so they do not need "shadow" fields. Note, one + * could consider reworking locking and basing it on "shadow" fields. + */ +struct ubifs_inode { + struct inode vfs_inode; + unsigned long long creat_sqnum; + unsigned long long del_cmtno; + unsigned int xattr_size; + unsigned int xattr_cnt; + unsigned int xattr_names; + unsigned int dirty:1; + unsigned int xattr:1; + unsigned int bulk_read:1; + unsigned int compr_type:2; + struct mutex ui_mutex; + spinlock_t ui_lock; + loff_t synced_i_size; + loff_t ui_size; + int flags; + pgoff_t last_page_read; + pgoff_t read_in_a_row; + int data_len; + void *data; +}; + +/** + * struct ubifs_unclean_leb - records a LEB recovered under read-only mode. + * @list: list + * @lnum: LEB number of recovered LEB + * @endpt: offset where recovery ended + * + * This structure records a LEB identified during recovery that needs to be + * cleaned but was not because UBIFS was mounted read-only. The information + * is used to clean the LEB when remounting to read-write mode. + */ +struct ubifs_unclean_leb { + struct list_head list; + int lnum; + int endpt; +}; + +/* + * LEB properties flags. 
+ * + * LPROPS_UNCAT: not categorized + * LPROPS_DIRTY: dirty > free, dirty >= @c->dead_wm, not index + * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index + * LPROPS_FREE: free > 0, dirty < @c->dead_wm, not empty, not index + * LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs + * LPROPS_EMPTY: LEB is empty, not taken + * LPROPS_FREEABLE: free + dirty == leb_size, not index, not taken + * LPROPS_FRDI_IDX: free + dirty == leb_size and index, may be taken + * LPROPS_CAT_MASK: mask for the LEB categories above + * LPROPS_TAKEN: LEB was taken (this flag is not saved on the media) + * LPROPS_INDEX: LEB contains indexing nodes (this flag also exists on flash) + */ +enum { + LPROPS_UNCAT = 0, + LPROPS_DIRTY = 1, + LPROPS_DIRTY_IDX = 2, + LPROPS_FREE = 3, + LPROPS_HEAP_CNT = 3, + LPROPS_EMPTY = 4, + LPROPS_FREEABLE = 5, + LPROPS_FRDI_IDX = 6, + LPROPS_CAT_MASK = 15, + LPROPS_TAKEN = 16, + LPROPS_INDEX = 32, +}; + +/** + * struct ubifs_lprops - logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @flags: LEB properties flags (see above) + * @lnum: LEB number + * @list: list of same-category lprops (for LPROPS_EMPTY and LPROPS_FREEABLE) + * @hpos: heap position in heap of same-category lprops (other categories) + */ +struct ubifs_lprops { + int free; + int dirty; + int flags; + int lnum; + union { + struct list_head list; + int hpos; + }; +}; + +/** + * struct ubifs_lpt_lprops - LPT logical eraseblock properties. + * @free: amount of free space in bytes + * @dirty: amount of dirty space in bytes + * @tgc: trivial GC flag (1 => unmap after commit end) + * @cmt: commit flag (1 => reserved for commit) + */ +struct ubifs_lpt_lprops { + int free; + int dirty; + unsigned tgc:1; + unsigned cmt:1; +}; + +/** + * struct ubifs_lp_stats - statistics of eraseblocks in the main area. 
+ * @empty_lebs: number of empty LEBs + * @taken_empty_lebs: number of taken LEBs + * @idx_lebs: number of indexing LEBs + * @total_free: total free space in bytes (includes all LEBs) + * @total_dirty: total dirty space in bytes (includes all LEBs) + * @total_used: total used space in bytes (does not include index LEBs) + * @total_dead: total dead space in bytes (does not include index LEBs) + * @total_dark: total dark space in bytes (does not include index LEBs) + * + * The @taken_empty_lebs field counts the LEBs that are in the transient state + * of having been "taken" for use but not yet written to. @taken_empty_lebs is + * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be + * used by itself (in which case 'unused_lebs' would be a better name). In the + * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained + * by GC, but unlike other empty LEBs that are "taken", it may not be written + * straight away (i.e. before the next commit start or unmount), so either + * @gc_lnum must be specially accounted for, or the current approach followed + * i.e. count it under @taken_empty_lebs. + * + * @empty_lebs includes @taken_empty_lebs. + * + * @total_used, @total_dead and @total_dark fields do not account indexing + * LEBs. + */ +struct ubifs_lp_stats { + int empty_lebs; + int taken_empty_lebs; + int idx_lebs; + long long total_free; + long long total_dirty; + long long total_used; + long long total_dead; + long long total_dark; +}; + +struct ubifs_nnode; + +/** + * struct ubifs_cnode - LEB Properties Tree common node. 
+ * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (zero for pnodes, greater than zero for nnodes) + * @num: node number + */ +struct ubifs_cnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; +}; + +/** + * struct ubifs_pnode - LEB Properties Tree leaf node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always zero for pnodes) + * @num: node number + * @lprops: LEB properties array + */ +struct ubifs_pnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_lprops lprops[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_nbranch - LEB Properties Tree internal node branch. + * @lnum: LEB number of child + * @offs: offset of child + * @nnode: nnode child + * @pnode: pnode child + * @cnode: cnode child + */ +struct ubifs_nbranch { + int lnum; + int offs; + union { + struct ubifs_nnode *nnode; + struct ubifs_pnode *pnode; + struct ubifs_cnode *cnode; + }; +}; + +/** + * struct ubifs_nnode - LEB Properties Tree internal node. + * @parent: parent nnode + * @cnext: next cnode to commit + * @flags: flags (%DIRTY_LPT_NODE or %OBSOLETE_LPT_NODE) + * @iip: index in parent + * @level: level in the tree (always greater than zero for nnodes) + * @num: node number + * @nbranch: branches to child nodes + */ +struct ubifs_nnode { + struct ubifs_nnode *parent; + struct ubifs_cnode *cnext; + unsigned long flags; + int iip; + int level; + int num; + struct ubifs_nbranch nbranch[UBIFS_LPT_FANOUT]; +}; + +/** + * struct ubifs_lpt_heap - heap of categorized lprops. 
+ * @arr: heap array + * @cnt: number in heap + * @max_cnt: maximum number allowed in heap + * + * There are %LPROPS_HEAP_CNT heaps. + */ +struct ubifs_lpt_heap { + struct ubifs_lprops **arr; + int cnt; + int max_cnt; +}; + +/* + * Return codes for LPT scan callback function. + * + * LPT_SCAN_CONTINUE: continue scanning + * LPT_SCAN_ADD: add the LEB properties scanned to the tree in memory + * LPT_SCAN_STOP: stop scanning + */ +enum { + LPT_SCAN_CONTINUE = 0, + LPT_SCAN_ADD = 1, + LPT_SCAN_STOP = 2, +}; + +struct ubifs_info; + +/* Callback used by the 'ubifs_lpt_scan_nolock()' function */ +typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, + const struct ubifs_lprops *lprops, + int in_tree, void *data); + +/** + * struct ubifs_wbuf - UBIFS write-buffer. + * @c: UBIFS file-system description object + * @buf: write-buffer (of min. flash I/O unit size) + * @lnum: logical eraseblock number the write-buffer points to + * @offs: write-buffer offset in this logical eraseblock + * @avail: number of bytes available in the write-buffer + * @used: number of used bytes in the write-buffer + * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, + * %UBI_UNKNOWN) + * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep + * up by 'mutex_lock_nested()'). 
+ * @sync_callback: write-buffer synchronization callback + * @io_mutex: serializes write-buffer I/O + * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes + * fields + * @timer: write-buffer timer + * @timeout: timer expire interval in jiffies + * @need_sync: it is set if its timer expired and needs sync + * @next_ino: points to the next position of the following inode number + * @inodes: stores the inode numbers of the nodes which are in wbuf + * + * The write-buffer synchronization callback is called when the write-buffer is + * synchronized in order to notify how much space was wasted due to + * write-buffer padding and how much free space is left in the LEB. + * + * Note: the fields @buf, @lnum, @offs, @avail and @used can be read under + * spin-lock or mutex because they are written under both mutex and spin-lock. + * @buf is appended to under mutex but overwritten under both mutex and + * spin-lock. Thus the data between @buf and @buf + @used can be read under + * spinlock. + */ +struct ubifs_wbuf { + struct ubifs_info *c; + void *buf; + int lnum; + int offs; + int avail; + int used; + int dtype; + int jhead; + int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); + struct mutex io_mutex; + spinlock_t lock; + struct timer_list timer; + int timeout; + int need_sync; + int next_ino; + ino_t *inodes; +}; + +/** + * struct ubifs_bud - bud logical eraseblock. + * @lnum: logical eraseblock number + * @start: where the (uncommitted) bud data starts + * @jhead: journal head number this bud belongs to + * @list: link in the list buds belonging to the same journal head + * @rb: link in the tree of all buds + */ +struct ubifs_bud { + int lnum; + int start; + int jhead; + struct list_head list; + struct rb_node rb; +}; + +/** + * struct ubifs_jhead - journal head. + * @wbuf: head's write-buffer + * @buds_list: list of bud LEBs belonging to this journal head + * + * Note, the @buds list is protected by the @c->buds_lock. 
+ */ +struct ubifs_jhead { + struct ubifs_wbuf wbuf; + struct list_head buds_list; +}; + +/** + * struct ubifs_zbranch - key/coordinate/length branch stored in znodes. + * @key: key + * @znode: znode address in memory + * @lnum: LEB number of the target node (indexing node or data node) + * @offs: target node offset within @lnum + * @len: target node length + */ +struct ubifs_zbranch { + union ubifs_key key; + union { + struct ubifs_znode *znode; + void *leaf; + }; + int lnum; + int offs; + int len; +}; + +/** + * struct ubifs_znode - in-memory representation of an indexing node. + * @parent: parent znode or NULL if it is the root + * @cnext: next znode to commit + * @flags: znode flags (%DIRTY_ZNODE, %COW_ZNODE or %OBSOLETE_ZNODE) + * @time: last access time (seconds) + * @level: level of the entry in the TNC tree + * @child_cnt: count of child znodes + * @iip: index in parent's zbranch array + * @alt: lower bound of key range has altered i.e. child inserted at slot 0 + * @lnum: LEB number of the corresponding indexing node + * @offs: offset of the corresponding indexing node + * @len: length of the corresponding indexing node + * @zbranch: array of znode branches (@c->fanout elements) + */ +struct ubifs_znode { + struct ubifs_znode *parent; + struct ubifs_znode *cnext; + unsigned long flags; + unsigned long time; + int level; + int child_cnt; + int iip; + int alt; +#ifdef CONFIG_UBIFS_FS_DEBUG + int lnum, offs, len; +#endif + struct ubifs_zbranch zbranch[]; +}; + +/** + * struct bu_info - bulk-read information. 
+ * @key: first data node key + * @zbranch: zbranches of data nodes to bulk read + * @buf: buffer to read into + * @buf_len: buffer length + * @gc_seq: GC sequence number to detect races with GC + * @cnt: number of data nodes for bulk read + * @blk_cnt: number of data blocks including holes + * @eof: end of file reached + */ +struct bu_info { + union ubifs_key key; + struct ubifs_zbranch zbranch[UBIFS_MAX_BULK_READ]; + void *buf; + int buf_len; + int gc_seq; + int cnt; + int blk_cnt; + int eof; +}; + +/** + * struct ubifs_node_range - node length range description data structure. + * @len: fixed node length + * @min_len: minimum possible node length + * @max_len: maximum possible node length + * + * If @max_len is %0, the node has fixed length @len. + */ +struct ubifs_node_range { + union { + int len; + int min_len; + }; + int max_len; +}; + +/** + * struct ubifs_compressor - UBIFS compressor description structure. + * @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) + * @cc: cryptoapi compressor handle + * @comp_mutex: mutex used during compression + * @decomp_mutex: mutex used during decompression + * @name: compressor name + * @capi_name: cryptoapi compressor name + */ +struct ubifs_compressor { + int compr_type; + struct crypto_comp *cc; + struct mutex *comp_mutex; + struct mutex *decomp_mutex; + const char *name; + const char *capi_name; +}; + +/** + * struct ubifs_budget_req - budget requirements of an operation. 
+ * + * @fast: non-zero if the budgeting should try to acquire budget quickly and + * should not try to call write-back + * @recalculate: non-zero if @idx_growth, @data_growth, and @dd_growth fields + * have to be re-calculated + * @new_page: non-zero if the operation adds a new page + * @dirtied_page: non-zero if the operation makes a page dirty + * @new_dent: non-zero if the operation adds a new directory entry + * @mod_dent: non-zero if the operation removes or modifies an existing + * directory entry + * @new_ino: non-zero if the operation adds a new inode + * @new_ino_d: how much data newly created inode contains + * @dirtied_ino: how many inodes the operation makes dirty + * @dirtied_ino_d: how much data dirtied inode contains + * @idx_growth: how much the index will supposedly grow + * @data_growth: how much new data the operation will supposedly add + * @dd_growth: how much data that makes other data dirty the operation will + * supposedly add + * + * @idx_growth, @data_growth and @dd_growth are not used in budget request. The + * budgeting subsystem caches index and data growth values there to avoid + * re-calculating them when the budget is released. However, if @idx_growth is + * %-1, it is calculated by the release function using other fields. + * + * An inode may contain 4KiB of data at max., thus the width of @new_ino_d + * is 13 bits, and @dirtied_ino_d - 15, because up to 4 inodes may be made + * dirty by the re-name operation. + * + * Note, UBIFS aligns node lengths to 8-bytes boundary, so the requester has to + * make sure the amount of inode data which contribute to @new_ino_d and + * @dirtied_ino_d fields are aligned. 
+ */ +struct ubifs_budget_req { + unsigned int fast:1; + unsigned int recalculate:1; +#ifndef UBIFS_DEBUG + unsigned int new_page:1; + unsigned int dirtied_page:1; + unsigned int new_dent:1; + unsigned int mod_dent:1; + unsigned int new_ino:1; + unsigned int new_ino_d:13; + unsigned int dirtied_ino:4; + unsigned int dirtied_ino_d:15; +#else + /* Not bit-fields to check for overflows */ + unsigned int new_page; + unsigned int dirtied_page; + unsigned int new_dent; + unsigned int mod_dent; + unsigned int new_ino; + unsigned int new_ino_d; + unsigned int dirtied_ino; + unsigned int dirtied_ino_d; +#endif + int idx_growth; + int data_growth; + int dd_growth; +}; + +/** + * struct ubifs_orphan - stores the inode number of an orphan. + * @rb: rb-tree node of rb-tree of orphans sorted by inode number + * @list: list head of list of orphans in order added + * @new_list: list head of list of orphans added since the last commit + * @cnext: next orphan to commit + * @dnext: next orphan to delete + * @inum: inode number + * @new: %1 => added since the last commit, otherwise %0 + */ +struct ubifs_orphan { + struct rb_node rb; + struct list_head list; + struct list_head new_list; + struct ubifs_orphan *cnext; + struct ubifs_orphan *dnext; + ino_t inum; + int new; +}; + +/** + * struct ubifs_mount_opts - UBIFS-specific mount options information. 
+ * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable) + * @chk_data_crc: enable/disable CRC data checking when reading data nodes + * (%0 default, %1 disable, %2 enable) + * @override_compr: override default compressor (%0 - do not override and use + * superblock compressor, %1 - override and use compressor + * specified in @compr_type) + * @compr_type: compressor type to override the superblock compressor with + * (%UBIFS_COMPR_NONE, etc) + */ +struct ubifs_mount_opts { + unsigned int unmount_mode:2; + unsigned int bulk_read:2; + unsigned int chk_data_crc:2; + unsigned int override_compr:1; + unsigned int compr_type:2; +}; + +struct ubifs_debug_info; + +/** + * struct ubifs_info - UBIFS file-system description data structure + * (per-superblock). + * @vfs_sb: VFS @struct super_block object + * @bdi: backing device info object to make VFS happy and disable read-ahead + * + * @highest_inum: highest used inode number + * @max_sqnum: current global sequence number + * @cmt_no: commit number of the last successfully completed commit, protected + * by @commit_sem + * @cnt_lock: protects @highest_inum and @max_sqnum counters + * @fmt_version: UBIFS on-flash format version + * @ro_compat_version: R/O compatibility version + * @uuid: UUID from super block + * + * @lhead_lnum: log head logical eraseblock number + * @lhead_offs: log head offset + * @ltail_lnum: log tail logical eraseblock number (offset is always 0) + * @log_mutex: protects the log, @lhead_lnum, @lhead_offs, @ltail_lnum, and + * @bud_bytes + * @min_log_bytes: minimum required number of bytes in the log + * @cmt_bud_bytes: used during commit to temporarily store the amount of + * bytes in committed buds + * + * @buds: tree of all buds indexed by bud LEB number + * @bud_bytes: how many bytes of flash is used by buds + * @buds_lock: protects the @buds tree, @bud_bytes, and per-journal head bud + * lists + * @jhead_cnt: count 
of journal heads + * @jheads: journal heads (head zero is base head) + * @max_bud_bytes: maximum number of bytes allowed in buds + * @bg_bud_bytes: number of bud bytes when background commit is initiated + * @old_buds: buds to be released after commit ends + * @max_bud_cnt: maximum number of buds + * + * @commit_sem: synchronizes committer with other processes + * @cmt_state: commit state + * @cs_lock: commit state lock + * @cmt_wq: wait queue to sleep on if the log is full and a commit is running + * + * @big_lpt: flag that LPT is too big to write whole during commit + * @no_chk_data_crc: do not check CRCs when reading data nodes (except during + * recovery) + * @bulk_read: enable bulk-reads + * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) + * @rw_incompat: the media is not R/W compatible + * + * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and + * @calc_idx_sz + * @zroot: zbranch which points to the root index node and znode + * @cnext: next znode to commit + * @enext: next znode to commit to empty space + * @gap_lebs: array of LEBs used by the in-gaps commit method + * @cbuf: commit buffer + * @ileb_buf: buffer for commit in-the-gaps method + * @ileb_len: length of data in ileb_buf + * @ihead_lnum: LEB number of index head + * @ihead_offs: offset of index head + * @ilebs: pre-allocated index LEBs + * @ileb_cnt: number of pre-allocated index LEBs + * @ileb_nxt: next pre-allocated index LEBs + * @old_idx: tree of index nodes obsoleted since the last commit start + * @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c + * + * @mst_node: master node + * @mst_offs: offset of valid master node + * @mst_mutex: protects the master node area, @mst_node, and @mst_offs + * + * @max_bu_buf_len: maximum bulk-read buffer length + * @bu_mutex: protects the pre-allocated bulk-read buffer and @c->bu + * @bu: pre-allocated bulk-read information + * + * @log_lebs: number of logical eraseblocks in the log + 
* @log_bytes: log size in bytes + * @log_last: last LEB of the log + * @lpt_lebs: number of LEBs used for lprops table + * @lpt_first: first LEB of the lprops table area + * @lpt_last: last LEB of the lprops table area + * @orph_lebs: number of LEBs used for the orphan area + * @orph_first: first LEB of the orphan area + * @orph_last: last LEB of the orphan area + * @main_lebs: count of LEBs in the main area + * @main_first: first LEB of the main area + * @main_bytes: main area size in bytes + * + * @key_hash_type: type of the key hash + * @key_hash: direntry key hash function + * @key_fmt: key format + * @key_len: key length + * @fanout: fanout of the index tree (number of links per indexing node) + * + * @min_io_size: minimal input/output unit size + * @min_io_shift: number of bits in @min_io_size minus one + * @leb_size: logical eraseblock size in bytes + * @half_leb_size: half LEB size + * @idx_leb_size: how many bytes of an LEB are effectively available when it is + * used to store indexing nodes (@leb_size - @max_idx_node_sz) + * @leb_cnt: count of logical eraseblocks + * @max_leb_cnt: maximum count of logical eraseblocks + * @old_leb_cnt: count of logical eraseblocks before re-size + * @ro_media: the underlying UBI volume is read-only + * + * @dirty_pg_cnt: number of dirty pages (not used) + * @dirty_zn_cnt: number of dirty znodes + * @clean_zn_cnt: number of clean znodes + * + * @budg_idx_growth: amount of bytes budgeted for index growth + * @budg_data_growth: amount of bytes budgeted for cached data + * @budg_dd_growth: amount of bytes budgeted for cached data that will make + * other data dirty + * @budg_uncommitted_idx: amount of bytes were budgeted for growth of the index, + * but which still have to be taken into account because + * the index has not been committed so far + * @space_lock: protects @budg_idx_growth, @budg_data_growth, @budg_dd_growth, + * @budg_uncommitted_idx, @min_idx_lebs, @old_idx_sz, @lst, + * @nospace, and @nospace_rp; + * 
@min_idx_lebs: minimum number of LEBs required for the index + * @old_idx_sz: size of index on flash + * @calc_idx_sz: temporary variable which is used to calculate new index size + * (contains accurate new index size at end of TNC commit start) + * @lst: lprops statistics + * @nospace: non-zero if the file-system does not have flash space (used as + * optimization) + * @nospace_rp: the same as @nospace, but additionally means that even reserved + * pool is full + * + * @page_budget: budget for a page + * @inode_budget: budget for an inode + * @dent_budget: budget for a directory entry + * + * @ref_node_alsz: size of the LEB reference node aligned to the min. flash + * I/O unit + * @mst_node_alsz: master node aligned size + * @min_idx_node_sz: minimum indexing node aligned on 8-bytes boundary + * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary + * @max_inode_sz: maximum possible inode size in bytes + * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting + * @dead_wm: LEB dead space watermark + * @dark_wm: LEB dark space watermark + * @block_cnt: count of 4KiB blocks on the FS + * + * @ranges: UBIFS node length ranges + * @ubi: UBI volume descriptor + * @di: UBI device information + * @vi: UBI volume information + * + * @orph_tree: rb-tree of orphan inode numbers + * @orph_list: list of orphan inode numbers in order added + * @orph_new: list of orphan inode numbers added since last commit + * @orph_cnext: next orphan to commit + * @orph_dnext: next orphan to delete + * @orphan_lock: lock for orph_tree and orph_new + * @orph_buf: buffer for orphan nodes + * @new_orphans: number of orphans since last commit + * @cmt_orphans: number of orphans being committed + * @tot_orphans: number of orphans in the rb_tree + * @max_orphans: maximum number of orphans allowed + * @ohead_lnum: orphan head LEB number + * @ohead_offs: 
orphan head offset + * @no_orphs: non-zero if there are no orphans + * + * @bgt: UBIFS background thread + * @bgt_name: background thread name + * @need_bgt: if background thread should run + * @need_wbuf_sync: if write-buffers have to be synchronized + * + * @gc_lnum: LEB number used for garbage collection + * @sbuf: a buffer of LEB size used by GC and replay for scanning + * @idx_gc: list of index LEBs that have been garbage collected + * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected + * + * @infos_list: links all 'ubifs_info' objects + * @umount_mutex: serializes shrinker and un-mount + * @shrinker_run_no: shrinker run number + * + * @space_bits: number of bits needed to record free or dirty space + * @lpt_lnum_bits: number of bits needed to record a LEB number in the LPT + * @lpt_offs_bits: number of bits needed to record an offset in the LPT + * @lpt_spc_bits: number of bits needed to record space in the LPT + * @pcnt_bits: number of bits needed to record pnode or nnode number + * @lnum_bits: number of bits needed to record LEB number + * @nnode_sz: size of on-flash nnode + * @pnode_sz: size of on-flash pnode + * @ltab_sz: size of on-flash LPT lprops table + * @lsave_sz: size of on-flash LPT save table + * @pnode_cnt: number of pnodes + * @nnode_cnt: number of nnodes + * @lpt_hght: height of the LPT + * @pnodes_have: number of pnodes in memory + * + * @lp_mutex: protects lprops table and all the other lprops-related fields + * @lpt_lnum: LEB number of the root nnode of the LPT + * @lpt_offs: offset of the root nnode of the LPT + * @nhead_lnum: LEB number of LPT head + * @nhead_offs: offset of LPT head + * @lpt_drty_flgs: dirty flags for LPT special nodes e.g. 
ltab + * @dirty_nn_cnt: number of dirty nnodes + * @dirty_pn_cnt: number of dirty pnodes + * @check_lpt_free: flag that indicates LPT GC may be needed + * @lpt_sz: LPT size + * @lpt_nod_buf: buffer for an on-flash nnode or pnode + * @lpt_buf: buffer of LEB size used by LPT + * @nroot: address in memory of the root nnode of the LPT + * @lpt_cnext: next LPT node to commit + * @lpt_heap: array of heaps of categorized lprops + * @dirty_idx: a (reverse sorted) copy of the LPROPS_DIRTY_IDX heap as at + * previous commit start + * @uncat_list: list of un-categorized LEBs + * @empty_list: list of empty LEBs + * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) + * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) + * @freeable_cnt: number of freeable LEBs in @freeable_list + * + * @ltab_lnum: LEB number of LPT's own lprops table + * @ltab_offs: offset of LPT's own lprops table + * @ltab: LPT's own lprops table + * @ltab_cmt: LPT's own lprops table (commit copy) + * @lsave_cnt: number of LEB numbers in LPT's save table + * @lsave_lnum: LEB number of LPT's save table + * @lsave_offs: offset of LPT's save table + * @lsave: LPT's save table + * @lscan_lnum: LEB number of last LPT scan + * + * @rp_size: size of the reserved pool in bytes + * @report_rp_size: size of the reserved pool reported to user-space + * @rp_uid: reserved pool user ID + * @rp_gid: reserved pool group ID + * + * @empty: if the UBI device is empty + * @replay_tree: temporary tree used during journal replay + * @replay_list: temporary list used during journal replay + * @replay_buds: list of buds to replay + * @cs_sqnum: sequence number of first node in the log (commit start node) + * @replay_sqnum: sequence number of node currently being replayed + * @need_recovery: file-system needs recovery + * @replaying: set to %1 during journal replay + * @unclean_leb_list: LEBs to recover when mounting ro to rw + * @rcvrd_mst_node: recovered master node to write when 
mounting ro to rw + * @size_tree: inode size information for recovery + * @remounting_rw: set while remounting from ro to rw (sb flags have MS_RDONLY) + * @always_chk_crc: always check CRCs (while mounting and remounting rw) + * @mount_opts: UBIFS-specific mount options + * + * @dbg: debugging-related information + */ +struct ubifs_info { + struct super_block *vfs_sb; + struct backing_dev_info bdi; + + ino_t highest_inum; + unsigned long long max_sqnum; + unsigned long long cmt_no; + spinlock_t cnt_lock; + int fmt_version; + int ro_compat_version; + unsigned char uuid[16]; + + int lhead_lnum; + int lhead_offs; + int ltail_lnum; + struct mutex log_mutex; + int min_log_bytes; + long long cmt_bud_bytes; + + struct rb_root buds; + long long bud_bytes; + spinlock_t buds_lock; + int jhead_cnt; + struct ubifs_jhead *jheads; + long long max_bud_bytes; + long long bg_bud_bytes; + struct list_head old_buds; + int max_bud_cnt; + + struct rw_semaphore commit_sem; + int cmt_state; + spinlock_t cs_lock; + wait_queue_head_t cmt_wq; + + unsigned int big_lpt:1; + unsigned int no_chk_data_crc:1; + unsigned int bulk_read:1; + unsigned int default_compr:2; + unsigned int rw_incompat:1; + + struct mutex tnc_mutex; + struct ubifs_zbranch zroot; + struct ubifs_znode *cnext; + struct ubifs_znode *enext; + int *gap_lebs; + void *cbuf; + void *ileb_buf; + int ileb_len; + int ihead_lnum; + int ihead_offs; + int *ilebs; + int ileb_cnt; + int ileb_nxt; + struct rb_root old_idx; + int *bottom_up_buf; + + struct ubifs_mst_node *mst_node; + int mst_offs; + struct mutex mst_mutex; + + int max_bu_buf_len; + struct mutex bu_mutex; + struct bu_info bu; + + int log_lebs; + long long log_bytes; + int log_last; + int lpt_lebs; + int lpt_first; + int lpt_last; + int orph_lebs; + int orph_first; + int orph_last; + int main_lebs; + int main_first; + long long main_bytes; + + uint8_t key_hash_type; + uint32_t (*key_hash)(const char *str, int len); + int key_fmt; + int key_len; + int fanout; + + int 
min_io_size; + int min_io_shift; + int leb_size; + int half_leb_size; + int idx_leb_size; + int leb_cnt; + int max_leb_cnt; + int old_leb_cnt; + int ro_media; + + atomic_long_t dirty_pg_cnt; + atomic_long_t dirty_zn_cnt; + atomic_long_t clean_zn_cnt; + + long long budg_idx_growth; + long long budg_data_growth; + long long budg_dd_growth; + long long budg_uncommitted_idx; + spinlock_t space_lock; + int min_idx_lebs; + unsigned long long old_idx_sz; + unsigned long long calc_idx_sz; + struct ubifs_lp_stats lst; + unsigned int nospace:1; + unsigned int nospace_rp:1; + + int page_budget; + int inode_budget; + int dent_budget; + + int ref_node_alsz; + int mst_node_alsz; + int min_idx_node_sz; + int max_idx_node_sz; + long long max_inode_sz; + int max_znode_sz; + + int leb_overhead; + int dead_wm; + int dark_wm; + int block_cnt; + + struct ubifs_node_range ranges[UBIFS_NODE_TYPES_CNT]; + struct ubi_volume_desc *ubi; + struct ubi_device_info di; + struct ubi_volume_info vi; + + struct rb_root orph_tree; + struct list_head orph_list; + struct list_head orph_new; + struct ubifs_orphan *orph_cnext; + struct ubifs_orphan *orph_dnext; + spinlock_t orphan_lock; + void *orph_buf; + int new_orphans; + int cmt_orphans; + int tot_orphans; + int max_orphans; + int ohead_lnum; + int ohead_offs; + int no_orphs; + + struct task_struct *bgt; + char bgt_name[sizeof(BGT_NAME_PATTERN) + 9]; + int need_bgt; + int need_wbuf_sync; + + int gc_lnum; + void *sbuf; + struct list_head idx_gc; + int idx_gc_cnt; + int gc_seq; + int gced_lnum; + + struct list_head infos_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + int space_bits; + int lpt_lnum_bits; + int lpt_offs_bits; + int lpt_spc_bits; + int pcnt_bits; + int lnum_bits; + int nnode_sz; + int pnode_sz; + int ltab_sz; + int lsave_sz; + int pnode_cnt; + int nnode_cnt; + int lpt_hght; + int pnodes_have; + + struct mutex lp_mutex; + int lpt_lnum; + int lpt_offs; + int nhead_lnum; + int nhead_offs; + int lpt_drty_flgs; + int 
dirty_nn_cnt; + int dirty_pn_cnt; + int check_lpt_free; + long long lpt_sz; + void *lpt_nod_buf; + void *lpt_buf; + struct ubifs_nnode *nroot; + struct ubifs_cnode *lpt_cnext; + struct ubifs_lpt_heap lpt_heap[LPROPS_HEAP_CNT]; + struct ubifs_lpt_heap dirty_idx; + struct list_head uncat_list; + struct list_head empty_list; + struct list_head freeable_list; + struct list_head frdi_idx_list; + int freeable_cnt; + + int ltab_lnum; + int ltab_offs; + struct ubifs_lpt_lprops *ltab; + struct ubifs_lpt_lprops *ltab_cmt; + int lsave_cnt; + int lsave_lnum; + int lsave_offs; + int *lsave; + int lscan_lnum; + + long long rp_size; + long long report_rp_size; + uid_t rp_uid; + gid_t rp_gid; + + /* The below fields are used only during mounting and re-mounting */ + int empty; + struct rb_root replay_tree; + struct list_head replay_list; + struct list_head replay_buds; + unsigned long long cs_sqnum; + unsigned long long replay_sqnum; + int need_recovery; + int replaying; + struct list_head unclean_leb_list; + struct ubifs_mst_node *rcvrd_mst_node; + struct rb_root size_tree; + int remounting_rw; + int always_chk_crc; + struct ubifs_mount_opts mount_opts; + +#ifdef CONFIG_UBIFS_FS_DEBUG + struct ubifs_debug_info *dbg; +#endif +}; + +extern struct list_head ubifs_infos; +extern spinlock_t ubifs_infos_lock; +extern atomic_long_t ubifs_clean_zn_cnt; +extern struct kmem_cache *ubifs_inode_slab; +extern const struct super_operations ubifs_super_operations; +extern const struct address_space_operations ubifs_file_address_operations; +extern const struct file_operations ubifs_file_operations; +extern const struct inode_operations ubifs_file_inode_operations; +extern const struct file_operations ubifs_dir_operations; +extern const struct inode_operations ubifs_dir_inode_operations; +extern const struct inode_operations ubifs_symlink_inode_operations; +extern struct backing_dev_info ubifs_backing_dev_info; +extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; + +/* io.c 
*/ +void ubifs_ro_mode(struct ubifs_info *c, int err); +int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len); +int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs, + int dtype); +int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf); +int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len, + int lnum, int offs); +int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, + int lnum, int offs); +int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum, + int offs, int dtype); +int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, + int offs, int quiet, int must_chk_crc); +void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad); +void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last); +int ubifs_io_init(struct ubifs_info *c); +void ubifs_pad(const struct ubifs_info *c, void *buf, int pad); +int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf); +int ubifs_bg_wbufs_sync(struct ubifs_info *c); +void ubifs_wbuf_add_ino_nolock(struct ubifs_wbuf *wbuf, ino_t inum); +int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode); + +/* scan.c */ +struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_scan_destroy(struct ubifs_scan_leb *sleb); +int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, + int offs, int quiet); +struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, + int offs, void *sbuf); +void ubifs_end_scan(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + int lnum, int offs); +int ubifs_add_snod(const struct ubifs_info *c, struct ubifs_scan_leb *sleb, + void *buf, int offs); +void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, + void *buf); + +/* log.c */ +void ubifs_add_bud(struct ubifs_info *c, struct ubifs_bud *bud); +void 
ubifs_create_buds_lists(struct ubifs_info *c); +int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs); +struct ubifs_bud *ubifs_search_bud(struct ubifs_info *c, int lnum); +struct ubifs_wbuf *ubifs_get_wbuf(struct ubifs_info *c, int lnum); +int ubifs_log_start_commit(struct ubifs_info *c, int *ltail_lnum); +int ubifs_log_end_commit(struct ubifs_info *c, int new_ltail_lnum); +int ubifs_log_post_commit(struct ubifs_info *c, int old_ltail_lnum); +int ubifs_consolidate_log(struct ubifs_info *c); + +/* journal.c */ +int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, + const struct qstr *nm, const struct inode *inode, + int deletion, int xent); +int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, + const union ubifs_key *key, const void *buf, int len); +int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); +int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, + const struct dentry *old_dentry, + const struct inode *new_dir, + const struct dentry *new_dentry, int sync); +int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, + loff_t old_size, loff_t new_size); +int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, + const struct inode *inode, const struct qstr *nm); +int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, + const struct inode *inode2); + +/* budget.c */ +int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req); +void ubifs_release_dirty_inode_budget(struct ubifs_info *c, + struct ubifs_inode *ui); +int ubifs_budget_inode_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +void 
ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, + struct ubifs_budget_req *req); +long long ubifs_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space_nolock(struct ubifs_info *c); +int ubifs_calc_min_idx_lebs(struct ubifs_info *c); +void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, long long free); +long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); + +/* find.c */ +int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs, + int squeeze); +int ubifs_find_free_leb_for_idx(struct ubifs_info *c); +int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, + int min_space, int pick_free); +int ubifs_find_dirty_idx_leb(struct ubifs_info *c); +int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); + +/* tnc.c */ +int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_znode **zn, int *n); +int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, + void *node, const struct qstr *nm); +int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, + void *node, int *lnum, int *offs); +int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, + int offs, int len); +int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, + int old_lnum, int old_offs, int lnum, int offs, int len); +int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, + int lnum, int offs, int len, const struct qstr *nm); +int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); +int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, + const struct qstr *nm); +int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, + union ubifs_key *to_key); +int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); +struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, + union ubifs_key *key, + 
const struct qstr *nm); +void ubifs_tnc_close(struct ubifs_info *c); +int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs, int is_idx); +int ubifs_dirty_idx_node(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +/* Shared by tnc.c for tnc_commit.c */ +void destroy_old_idx(struct ubifs_info *c); +int is_idx_node_in_tnc(struct ubifs_info *c, union ubifs_key *key, int level, + int lnum, int offs); +int insert_old_idx_znode(struct ubifs_info *c, struct ubifs_znode *znode); +int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu); +int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu); + +/* tnc_misc.c */ +struct ubifs_znode *ubifs_tnc_levelorder_next(struct ubifs_znode *zr, + struct ubifs_znode *znode); +int ubifs_search_zbranch(const struct ubifs_info *c, + const struct ubifs_znode *znode, + const union ubifs_key *key, int *n); +struct ubifs_znode *ubifs_tnc_postorder_first(struct ubifs_znode *znode); +struct ubifs_znode *ubifs_tnc_postorder_next(struct ubifs_znode *znode); +long ubifs_destroy_tnc_subtree(struct ubifs_znode *zr); +struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c, + struct ubifs_zbranch *zbr, + struct ubifs_znode *parent, int iip); +int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node); + +/* tnc_commit.c */ +int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot); +int ubifs_tnc_end_commit(struct ubifs_info *c); + +/* shrinker.c */ +int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask); + +/* commit.c */ +int ubifs_bg_thread(void *info); +void ubifs_commit_required(struct ubifs_info *c); +void ubifs_request_bg_commit(struct ubifs_info *c); +int ubifs_run_commit(struct ubifs_info *c); +void ubifs_recovery_commit(struct ubifs_info *c); +int ubifs_gc_should_commit(struct ubifs_info *c); +void ubifs_wait_for_commit(struct ubifs_info *c); + +/* master.c */ +int ubifs_read_master(struct ubifs_info 
*c); +int ubifs_write_master(struct ubifs_info *c); + +/* sb.c */ +int ubifs_read_superblock(struct ubifs_info *c); +struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); +int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); + +/* replay.c */ +int ubifs_validate_entry(struct ubifs_info *c, + const struct ubifs_dent_node *dent); +int ubifs_replay_journal(struct ubifs_info *c); + +/* gc.c */ +int ubifs_garbage_collect(struct ubifs_info *c, int anyway); +int ubifs_gc_start_commit(struct ubifs_info *c); +int ubifs_gc_end_commit(struct ubifs_info *c); +void ubifs_destroy_idx_gc(struct ubifs_info *c); +int ubifs_get_idx_gc_leb(struct ubifs_info *c); +int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp); + +/* orphan.c */ +int ubifs_add_orphan(struct ubifs_info *c, ino_t inum); +void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum); +int ubifs_orphan_start_commit(struct ubifs_info *c); +int ubifs_orphan_end_commit(struct ubifs_info *c); +int ubifs_mount_orphans(struct ubifs_info *c, int unclean, int read_only); +int ubifs_clear_orphans(struct ubifs_info *c); + +/* lpt.c */ +int ubifs_calc_lpt_geom(struct ubifs_info *c); +int ubifs_create_dflt_lpt(struct ubifs_info *c, int *main_lebs, int lpt_first, + int *lpt_lebs, int *big_lpt); +int ubifs_lpt_init(struct ubifs_info *c, int rd, int wr); +struct ubifs_lprops *ubifs_lpt_lookup(struct ubifs_info *c, int lnum); +struct ubifs_lprops *ubifs_lpt_lookup_dirty(struct ubifs_info *c, int lnum); +int ubifs_lpt_scan_nolock(struct ubifs_info *c, int start_lnum, int end_lnum, + ubifs_lpt_scan_callback scan_cb, void *data); + +/* Shared by lpt.c for lpt_commit.c */ +void ubifs_pack_lsave(struct ubifs_info *c, void *buf, int *lsave); +void ubifs_pack_ltab(struct ubifs_info *c, void *buf, + struct ubifs_lpt_lprops *ltab); +void ubifs_pack_pnode(struct ubifs_info *c, void *buf, + struct ubifs_pnode *pnode); +void ubifs_pack_nnode(struct ubifs_info *c, void *buf, + struct 
ubifs_nnode *nnode); +struct ubifs_pnode *ubifs_get_pnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +struct ubifs_nnode *ubifs_get_nnode(struct ubifs_info *c, + struct ubifs_nnode *parent, int iip); +int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip); +void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty); +void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode); +uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits); +struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght); +/* Needed only in debugging code in lpt_commit.c */ +int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, + struct ubifs_nnode *nnode); + +/* lpt_commit.c */ +int ubifs_lpt_start_commit(struct ubifs_info *c); +int ubifs_lpt_end_commit(struct ubifs_info *c); +int ubifs_lpt_post_commit(struct ubifs_info *c); +void ubifs_lpt_free(struct ubifs_info *c, int wr_only); + +/* lprops.c */ +const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c, + const struct ubifs_lprops *lp, + int free, int dirty, int flags, + int idx_gc_cnt); +void ubifs_get_lp_stats(struct ubifs_info *c, struct ubifs_lp_stats *lst); +void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, + int cat); +void ubifs_replace_cat(struct ubifs_info *c, struct ubifs_lprops *old_lprops, + struct ubifs_lprops *new_lprops); +void ubifs_ensure_cat(struct ubifs_info *c, struct ubifs_lprops *lprops); +int ubifs_categorize_lprops(const struct ubifs_info *c, + const struct ubifs_lprops *lprops); +int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean, int idx_gc_cnt); +int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, + int flags_set, int flags_clean); +int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp); +const struct ubifs_lprops *ubifs_fast_find_free(struct ubifs_info *c); +const struct 
ubifs_lprops *ubifs_fast_find_empty(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_freeable(struct ubifs_info *c); +const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c); + +/* file.c */ +int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); +int ubifs_setattr(struct dentry *dentry, struct iattr *attr); + +/* dir.c */ +struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, + int mode); +int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); + +/* xattr.c */ +int ubifs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, + size_t size); +ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); +int ubifs_removexattr(struct dentry *dentry, const char *name); + +/* super.c */ +struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); + +/* recovery.c */ +int ubifs_recover_master_node(struct ubifs_info *c); +int ubifs_write_rcvrd_mst_node(struct ubifs_info *c); +struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf, int grouped); +struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, + int offs, void *sbuf); +int ubifs_recover_inl_heads(const struct ubifs_info *c, void *sbuf); +int ubifs_clean_lebs(const struct ubifs_info *c, void *sbuf); +int ubifs_rcvry_gc_commit(struct ubifs_info *c); +int ubifs_recover_size_accum(struct ubifs_info *c, union ubifs_key *key, + int deletion, loff_t new_size); +int ubifs_recover_size(struct ubifs_info *c); +void ubifs_destroy_size_tree(struct ubifs_info *c); + +/* ioctl.c */ +long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +void ubifs_set_inode_flags(struct inode *inode); +#ifdef CONFIG_COMPAT +long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +#endif + +/* 
compressor.c */ +int __init ubifs_compressors_init(void); +void ubifs_compressors_exit(void); +void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, + int *compr_type); +int ubifs_decompress(const void *buf, int len, void *out, int *out_len, + int compr_type); + +#include "debug.h" +#include "misc.h" +#include "key.h" + +#endif /* !__UBIFS_H__ */ diff -Nurd linux-2.6.24/fs/ubifs/xattr.c ubifs-v2.6.24/fs/ubifs/xattr.c --- linux-2.6.24/fs/ubifs/xattr.c 1970-01-01 02:00:00.000000000 +0200 +++ ubifs-v2.6.24/fs/ubifs/xattr.c 2009-04-07 17:14:47.000000000 +0200 @@ -0,0 +1,571 @@ +/* + * This file is part of UBIFS. + * + * Copyright (C) 2006-2008 Nokia Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 51 + * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: Artem Bityutskiy (Битюцкий Артём) + * Adrian Hunter + */ + +/* + * This file implements UBIFS extended attributes support. + * + * Extended attributes are implemented as regular inodes with attached data, + * which limits extended attribute size to UBIFS block size (4KiB). Names of + * extended attributes are described by extended attribute entries (xentries), + * which are almost identical to directory entries, but have different key type. + * + * In other words, the situation with extended attributes is very similar to + * directories. 
Indeed, any inode (but of course not xattr inodes) may have a + * number of associated xentries, just like directory inodes have associated + * directory entries. Extended attribute entries store the name of the extended + * attribute, the host inode number, and the extended attribute inode number. + * Similarly, direntries store the name, the parent and the target inode + * numbers. Thus, most of the common UBIFS mechanisms may be re-used for + * extended attributes. + * + * The number of extended attributes is not limited, but there is a Linux + * limitation on the maximum possible size of the list of all extended + * attributes associated with an inode (%XATTR_LIST_MAX), so UBIFS makes sure + * the sum of all extended attribute names of the inode does not exceed that + * limit. + * + * Extended attributes are synchronous, which means they are written to the + * flash media synchronously and there is no write-back for extended attribute + * inodes. The extended attribute values are not stored in compressed form on + * the media. + * + * Since extended attributes are represented by regular inodes, they are cached + * in the VFS inode cache. The xentries are cached in the LNC cache (see + * tnc.c). + * + * ACL support is not implemented. + */ + +#include +#include +#include "ubifs.h" + +/* + * Limit the number of extended attributes per inode so that the total size + * (@xattr_size) is guaranteed to fit in an 'unsigned int'. + */ +#define MAX_XATTRS_PER_INODE 65535 + +/* + * Extended attribute type constants. 
+ *
+ * USER_XATTR: user extended attribute ("user.*")
+ * TRUSTED_XATTR: trusted extended attribute ("trusted.*")
+ * SECURITY_XATTR: security extended attribute ("security.*")
+ */
+enum {
+	USER_XATTR,
+	TRUSTED_XATTR,
+	SECURITY_XATTR,
+};
+
+static struct inode_operations none_inode_operations;
+static struct address_space_operations none_address_operations;
+static struct file_operations none_file_operations;
+
+/**
+ * create_xattr - create an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @nm: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This is a helper function which creates an extended attribute of name @nm
+ * and value @value for inode @host. The host inode is also updated on flash
+ * because the ctime and extended attribute accounting data changes. This
+ * function returns zero in case of success and a negative error code in case
+ * of failure: %-ENOSPC if the per-inode extended attribute limits would be
+ * exceeded, %-ENOMEM if the value buffer cannot be allocated, or the error
+ * returned by the budgeting, inode allocation or journal code. If the journal
+ * update fails, all the host inode accounting fields changed by this function
+ * (@xattr_cnt, @xattr_size and @xattr_names) are restored.
+ */
+static int create_xattr(struct ubifs_info *c, struct inode *host,
+			const struct qstr *nm, const void *value, int size)
+{
+	int err;
+	struct inode *inode;
+	struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
+	struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+				.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+		return -ENOSPC;
+	/*
+	 * Linux limits the maximum size of the extended attribute names list
+	 * to %XATTR_LIST_MAX. This means we should not allow creating more
+	 * extended attributes if the name list becomes larger. This limitation
+	 * is artificial for UBIFS, though.
+	 *
+	 * Every name in the list is followed by a terminating zero byte, hence
+	 * the "+ xattr_cnt" and "+ 1" terms.
+	 */
+	if (host_ui->xattr_names + host_ui->xattr_cnt +
+	    nm->len + 1 > XATTR_LIST_MAX)
+		return -ENOSPC;
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	inode = ubifs_new_inode(c, host, S_IFREG | S_IRWXUGO);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_budg;
+	}
+
+	/* Re-define all operations to be "nothing" */
+	inode->i_mapping->a_ops = &none_address_operations;
+	inode->i_op = &none_inode_operations;
+	inode->i_fop = &none_file_operations;
+
+	inode->i_flags |= S_SYNC | S_NOATIME | S_NOCMTIME | S_NOQUOTA;
+	ui = ubifs_inode(inode);
+	ui->xattr = 1;
+	ui->flags |= UBIFS_XATTR_FL;
+	ui->data = kmalloc(size, GFP_NOFS);
+	if (!ui->data) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+	memcpy(ui->data, value, size);
+	inode->i_size = ui->ui_size = size;
+	ui->data_len = size;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+	host_ui->xattr_names += nm->len;
+
+	err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	insert_inode_hash(inode);
+	iput(inode);
+	return 0;
+
+out_cancel:
+	/*
+	 * Roll back every accounting field modified above, including
+	 * @xattr_names, otherwise the names list size stays inflated and
+	 * later checks against %XATTR_LIST_MAX become wrong.
+	 */
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	host_ui->xattr_names -= nm->len;
+	mutex_unlock(&host_ui->ui_mutex);
+out_free:
+	make_bad_inode(inode);
+	iput(inode);
+out_budg:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * change_xattr - change an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ *
+ * This helper function changes the value of extended attribute @inode with new
+ * data from @value. Returns zero in case of success and a negative error code
+ * in case of failure.
+ */
+static int change_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const void *value, int size)
+{
+	int err, old_size;
+	void *buf;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 2,
+		.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	/*
+	 * Allocate and fill the new value buffer before releasing the old
+	 * one, so that an allocation failure leaves the extended attribute
+	 * inode with its previous, still valid, data.
+	 */
+	buf = kmalloc(size, GFP_NOFS);
+	if (!buf) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+	memcpy(buf, value, size);
+
+	kfree(ui->data);
+	ui->data = buf;
+	inode->i_size = ui->ui_size = size;
+	/*
+	 * Save the old value length before overwriting it: the host inode
+	 * accounting below has to subtract the old size and add the new one.
+	 */
+	old_size = ui->data_len;
+	ui->data_len = size;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
+	host_ui->xattr_size += CALC_XATTR_BYTES(size);
+
+	/*
+	 * It is important to write the host inode after the xattr inode
+	 * because if the host inode gets synchronized (via 'fsync()'), then
+	 * the extended attribute inode gets synchronized, because it goes
+	 * before the host inode in the write-buffer.
+	 */
+	err = ubifs_jnl_change_xattr(c, inode, host);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	/* Restore the host accounting to what it was before the change */
+	host_ui->xattr_size -= CALC_XATTR_BYTES(size);
+	host_ui->xattr_size += CALC_XATTR_BYTES(old_size);
+	mutex_unlock(&host_ui->ui_mutex);
+	make_bad_inode(inode);
+out_free:
+	ubifs_release_budget(c, &req);
+	return err;
+}
+
+/**
+ * check_namespace - check extended attribute name-space.
+ * @nm: extended attribute name
+ *
+ * This function makes sure the extended attribute name belongs to one of the
+ * supported extended attribute name-spaces. Returns name-space index in case
+ * of success and a negative error code in case of failure.
+ */
+static int check_namespace(const struct qstr *nm)
+{
+	int type;
+
+	if (nm->len > UBIFS_MAX_NLEN)
+		return -ENAMETOOLONG;
+
+	/*
+	 * In every name-space a bare prefix with nothing after it is rejected
+	 * with %-EINVAL; an unknown prefix yields %-EOPNOTSUPP.
+	 */
+	if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
+		     XATTR_TRUSTED_PREFIX_LEN)) {
+		if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
+			return -EINVAL;
+		type = TRUSTED_XATTR;
+	} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
+			    XATTR_USER_PREFIX_LEN)) {
+		if (nm->name[XATTR_USER_PREFIX_LEN] == '\0')
+			return -EINVAL;
+		type = USER_XATTR;
+	} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
+			    XATTR_SECURITY_PREFIX_LEN)) {
+		if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
+			return -EINVAL;
+		type = SECURITY_XATTR;
+	} else
+		return -EOPNOTSUPP;
+
+	return type;
+}
+
+/**
+ * iget_xattr - get an extended attribute inode.
+ * @c: UBIFS file-system description object
+ * @inum: extended attribute inode number
+ *
+ * This helper function reads inode @inum using 'ubifs_iget()' and makes sure
+ * it is marked as an extended attribute inode. Returns the inode in case of
+ * success and an error pointer in case of failure (%-EINVAL if the inode is
+ * not an extended attribute inode).
+ */
+static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
+{
+	struct inode *inode;
+
+	inode = ubifs_iget(c->vfs_sb, inum);
+	if (IS_ERR(inode)) {
+		ubifs_err("dead extended attribute entry, error %d",
+			  (int)PTR_ERR(inode));
+		return inode;
+	}
+	if (ubifs_inode(inode)->xattr)
+		return inode;
+	ubifs_err("corrupt extended attribute entry");
+	iput(inode);
+	return ERR_PTR(-EINVAL);
+}
+
+/**
+ * ubifs_setxattr - set an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ * @value: extended attribute value
+ * @size: size of extended attribute value
+ * @flags: %XATTR_CREATE and/or %XATTR_REPLACE, or zero
+ *
+ * This function creates the extended attribute if it does not exist and
+ * changes its value otherwise. Returns zero in case of success and a negative
+ * error code in case of failure: %-ERANGE if @size is larger than
+ * %UBIFS_MAX_INO_DATA, %-ENODATA if %XATTR_REPLACE was given but the attribute
+ * does not exist, %-EEXIST if %XATTR_CREATE was given but it does exist.
+ */
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+		   const void *value, size_t size, int flags)
+{
+	struct inode *inode, *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err, type;
+
+	dbg_gen("xattr '%s', host ino %lu ('%.*s'), size %zd", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+	if (size > UBIFS_MAX_INO_DATA)
+		return -ERANGE;
+
+	type = check_namespace(&nm);
+	if (type < 0)
+		return type;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	/*
+	 * The extended attribute entries are stored in LNC, so multiple
+	 * look-ups do not involve reading the flash.
+	 */
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err != -ENOENT)
+			goto out_free;
+
+		if (flags & XATTR_REPLACE)
+			/* We are asked not to create the xattr */
+			err = -ENODATA;
+		else
+			err = create_xattr(c, host, &nm, value, size);
+		goto out_free;
+	}
+
+	if (flags & XATTR_CREATE) {
+		/* We are asked not to replace the xattr */
+		err = -EEXIST;
+		goto out_free;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_free;
+	}
+
+	err = change_xattr(c, host, inode, value, size);
+	iput(inode);
+
+out_free:
+	kfree(xent);
+	return err;
+}
+
+/**
+ * ubifs_getxattr - get the value of an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ * @buf: buffer to copy the value to, or %NULL to only query the length
+ * @size: size of @buf
+ *
+ * Returns the length of the extended attribute value in case of success and a
+ * negative error code in case of failure: %-ENODATA if the attribute does not
+ * exist and %-ERANGE if @buf is too small for the value.
+ */
+ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
+		       size_t size)
+{
+	struct inode *inode, *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_inode *ui;
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err;
+
+	dbg_gen("xattr '%s', ino %lu ('%.*s'), buf size %zd", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name, size);
+
+	err = check_namespace(&nm);
+	if (err < 0)
+		return err;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err == -ENOENT)
+			err = -ENODATA;
+		goto out_free;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_free;
+	}
+
+	ui = ubifs_inode(inode);
+	ubifs_assert(inode->i_size == ui->data_len);
+	ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+
+	if (buf) {
+		/* If @buf is %NULL we are supposed to return the length */
+		if (ui->data_len > size) {
+			dbg_err("buffer size %zd, xattr len %d",
+				size, ui->data_len);
+			err = -ERANGE;
+			goto out_iput;
+		}
+
+		memcpy(buf, ui->data, ui->data_len);
+	}
+	err = ui->data_len;
+
+out_iput:
+	iput(inode);
+out_free:
+	kfree(xent);
+	return err;
+}
+
+/**
+ * ubifs_listxattr - list the extended attribute names of an inode.
+ * @dentry: dentry of the host inode
+ * @buffer: buffer to store the null-terminated names to, or %NULL to only
+ *          query the required buffer size
+ * @size: size of @buffer
+ *
+ * Returns the number of bytes written to @buffer (or the required buffer size
+ * if @buffer is %NULL) in case of success and a negative error code in case of
+ * failure (%-ERANGE if @buffer is too small). Names from the "trusted"
+ * name-space are listed only if the caller has %CAP_SYS_ADMIN.
+ */
+ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size)
+{
+	union ubifs_key key;
+	struct inode *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_dent_node *xent, *pxent = NULL;
+	int err, len, written = 0;
+	struct qstr nm = { .name = NULL };
+
+	dbg_gen("ino %lu ('%.*s'), buffer size %zd", host->i_ino,
+		dentry->d_name.len, dentry->d_name.name, size);
+
+	len = host_ui->xattr_names + host_ui->xattr_cnt;
+	if (!buffer)
+		/*
+		 * We should return the minimum buffer size which will fit a
+		 * null-terminated list of all the extended attribute names.
+		 */
+		return len;
+
+	if (len > size)
+		return -ERANGE;
+
+	lowest_xent_key(c, &key, host->i_ino);
+	while (1) {
+		int type;
+
+		xent = ubifs_tnc_next_ent(c, &key, &nm);
+		if (IS_ERR(xent)) {
+			err = PTR_ERR(xent);
+			break;
+		}
+
+		nm.name = xent->name;
+		nm.len = le16_to_cpu(xent->nlen);
+
+		type = check_namespace(&nm);
+		if (unlikely(type < 0)) {
+			err = type;
+			/*
+			 * Only the previous entry (@pxent) is freed after the
+			 * loop, so the current one has to be freed here.
+			 */
+			kfree(xent);
+			break;
+		}
+
+		/* Show trusted namespace only for "power" users */
+		if (type != TRUSTED_XATTR || capable(CAP_SYS_ADMIN)) {
+			memcpy(buffer + written, nm.name, nm.len + 1);
+			written += nm.len + 1;
+		}
+
+		/*
+		 * Keep the current entry alive until the next iteration because
+		 * @nm.name points into it.
+		 */
+		kfree(pxent);
+		pxent = xent;
+		key_read(c, &xent->key, &key);
+	}
+
+	kfree(pxent);
+	/* %-ENOENT is the normal way for the loop to end */
+	if (err != -ENOENT) {
+		ubifs_err("cannot find next direntry, error %d", err);
+		return err;
+	}
+
+	ubifs_assert(written <= size);
+	return written;
+}
+
+/**
+ * remove_xattr - remove an extended attribute.
+ * @c: UBIFS file-system description object
+ * @host: host inode
+ * @inode: extended attribute inode
+ * @nm: extended attribute name
+ *
+ * This helper function updates the extended attribute accounting of @host and
+ * writes the deletion to the journal. Returns zero in case of success and a
+ * negative error code in case of failure. If the journal update fails, all the
+ * host inode accounting fields changed by this function (@xattr_cnt,
+ * @xattr_size and @xattr_names) are restored and @inode is marked as bad.
+ */
+static int remove_xattr(struct ubifs_info *c, struct inode *host,
+			struct inode *inode, const struct qstr *nm)
+{
+	int err;
+	struct ubifs_inode *host_ui = ubifs_inode(host);
+	struct ubifs_inode *ui = ubifs_inode(inode);
+	struct ubifs_budget_req req = { .dirtied_ino = 2, .mod_dent = 1,
+				.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
+
+	ubifs_assert(ui->data_len == inode->i_size);
+
+	err = ubifs_budget_space(c, &req);
+	if (err)
+		return err;
+
+	mutex_lock(&host_ui->ui_mutex);
+	host->i_ctime = ubifs_current_time(host);
+	host_ui->xattr_cnt -= 1;
+	host_ui->xattr_size -= CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_names -= nm->len;
+
+	err = ubifs_jnl_delete_xattr(c, host, inode, nm);
+	if (err)
+		goto out_cancel;
+	mutex_unlock(&host_ui->ui_mutex);
+
+	ubifs_release_budget(c, &req);
+	return 0;
+
+out_cancel:
+	/*
+	 * Roll back every accounting field modified above, including
+	 * @xattr_names, otherwise the names list size reported by
+	 * 'ubifs_listxattr()' becomes too small.
+	 */
+	host_ui->xattr_cnt += 1;
+	host_ui->xattr_size += CALC_DENT_SIZE(nm->len);
+	host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
+	host_ui->xattr_names += nm->len;
+	mutex_unlock(&host_ui->ui_mutex);
+	ubifs_release_budget(c, &req);
+	make_bad_inode(inode);
+	return err;
+}
+
+/**
+ * ubifs_removexattr - remove an extended attribute.
+ * @dentry: dentry of the host inode
+ * @name: extended attribute name
+ *
+ * Returns zero in case of success and a negative error code in case of
+ * failure (%-ENODATA if the extended attribute does not exist).
+ */
+int ubifs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct inode *inode, *host = dentry->d_inode;
+	struct ubifs_info *c = host->i_sb->s_fs_info;
+	struct qstr nm = { .name = name, .len = strlen(name) };
+	struct ubifs_dent_node *xent;
+	union ubifs_key key;
+	int err;
+
+	dbg_gen("xattr '%s', ino %lu ('%.*s')", name,
+		host->i_ino, dentry->d_name.len, dentry->d_name.name);
+	ubifs_assert(mutex_is_locked(&host->i_mutex));
+
+	err = check_namespace(&nm);
+	if (err < 0)
+		return err;
+
+	xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS);
+	if (!xent)
+		return -ENOMEM;
+
+	xent_key_init(c, &key, host->i_ino, &nm);
+	err = ubifs_tnc_lookup_nm(c, &key, xent, &nm);
+	if (err) {
+		if (err == -ENOENT)
+			err = -ENODATA;
+		goto out_free;
+	}
+
+	inode = iget_xattr(c, le64_to_cpu(xent->inum));
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out_free;
+	}
+
+	ubifs_assert(inode->i_nlink == 1);
+	inode->i_nlink = 0;
+	err = remove_xattr(c, host, inode, &nm);
+	if (err)
+		/* The removal failed, so the inode must stay linked */
+		inode->i_nlink = 1;
+
+	/* If @i_nlink is 0, 'iput()' will delete the inode */
+	iput(inode);
+
+out_free:
+	kfree(xent);
+	return err;
+}
diff -Nurd linux-2.6.24/include/linux/fs.h ubifs-v2.6.24/include/linux/fs.h
--- linux-2.6.24/include/linux/fs.h 2008-01-25 00:58:37.000000000 +0200
+++ 
ubifs-v2.6.24/include/linux/fs.h 2009-04-07 17:14:47.000000000 +0200 @@ -1670,6 +1670,8 @@ extern int invalidate_inode_pages2(struct address_space *mapping); extern int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end); +extern void generic_sync_sb_inodes(struct super_block *sb, + struct writeback_control *wbc); extern int write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); extern int filemap_flush(struct address_space *); diff -Nurd linux-2.6.24/include/linux/mtd/ubi.h ubifs-v2.6.24/include/linux/mtd/ubi.h --- linux-2.6.24/include/linux/mtd/ubi.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/include/linux/mtd/ubi.h 2009-04-07 17:14:47.000000000 +0200 @@ -26,23 +26,6 @@ #include /* - * UBI data type hint constants. - * - * UBI_LONGTERM: long-term data - * UBI_SHORTTERM: short-term data - * UBI_UNKNOWN: data persistence is unknown - * - * These constants are used when data is written to UBI volumes in order to - * help the UBI wear-leveling unit to find more appropriate physical - * eraseblocks. - */ -enum { - UBI_LONGTERM = 1, - UBI_SHORTTERM, - UBI_UNKNOWN -}; - -/* * enum ubi_open_mode - UBI volume open mode constants. 
* * UBI_READONLY: read-only mode @@ -62,13 +45,13 @@ * @size: how many physical eraseblocks are reserved for this volume * @used_bytes: how many bytes of data this volume contains * @used_ebs: how many physical eraseblocks of this volume actually contain any - * data + * data * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) * @corrupted: non-zero if the volume is corrupted (static volumes only) * @upd_marker: non-zero if the volume has update marker set * @alignment: volume alignment * @usable_leb_size: how many bytes are available in logical eraseblocks of - * this volume + * this volume * @name_len: volume name length * @name: volume name * @cdev: UBI volume character device major and minor numbers @@ -167,7 +150,9 @@ int len, int dtype); int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum); int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum); +int ubi_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype); int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum); +int ubi_sync(int ubi_num); /* * This function is the same as the 'ubi_leb_read()' function, but it does not diff -Nurd linux-2.6.24/include/mtd/Kbuild ubifs-v2.6.24/include/mtd/Kbuild --- linux-2.6.24/include/mtd/Kbuild 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/include/mtd/Kbuild 2009-04-07 17:14:47.000000000 +0200 @@ -3,5 +3,4 @@ header-y += mtd-abi.h header-y += mtd-user.h header-y += nftl-user.h -header-y += ubi-header.h header-y += ubi-user.h diff -Nurd linux-2.6.24/include/mtd/ubi-header.h ubifs-v2.6.24/include/mtd/ubi-header.h --- linux-2.6.24/include/mtd/ubi-header.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/include/mtd/ubi-header.h 1970-01-01 02:00:00.000000000 +0200 @@ -1,331 +0,0 @@ -/* - * Copyright (c) International Business Machines Corp., 2006 - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; 
either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * Authors: Artem Bityutskiy (Битюцкий Артём) - * Thomas Gleixner - * Frank Haverkamp - * Oliver Lohmann - * Andreas Arnez - */ - -/* - * This file defines the layout of UBI headers and all the other UBI on-flash - * data structures. May be included by user-space. - */ - -#ifndef __UBI_HEADER_H__ -#define __UBI_HEADER_H__ - -#include - -/* The version of UBI images supported by this implementation */ -#define UBI_VERSION 1 - -/* The highest erase counter value supported by this implementation */ -#define UBI_MAX_ERASECOUNTER 0x7FFFFFFF - -/* The initial CRC32 value used when calculating CRC checksums */ -#define UBI_CRC32_INIT 0xFFFFFFFFU - -/* Erase counter header magic number (ASCII "UBI#") */ -#define UBI_EC_HDR_MAGIC 0x55424923 -/* Volume identifier header magic number (ASCII "UBI!") */ -#define UBI_VID_HDR_MAGIC 0x55424921 - -/* - * Volume type constants used in the volume identifier header. - * - * @UBI_VID_DYNAMIC: dynamic volume - * @UBI_VID_STATIC: static volume - */ -enum { - UBI_VID_DYNAMIC = 1, - UBI_VID_STATIC = 2 -}; - -/* - * Compatibility constants used by internal volumes. 
- * - * @UBI_COMPAT_DELETE: delete this internal volume before anything is written - * to the flash - * @UBI_COMPAT_RO: attach this device in read-only mode - * @UBI_COMPAT_PRESERVE: preserve this internal volume - do not touch its - * physical eraseblocks, don't allow the wear-leveling unit to move them - * @UBI_COMPAT_REJECT: reject this UBI image - */ -enum { - UBI_COMPAT_DELETE = 1, - UBI_COMPAT_RO = 2, - UBI_COMPAT_PRESERVE = 4, - UBI_COMPAT_REJECT = 5 -}; - -/* Sizes of UBI headers */ -#define UBI_EC_HDR_SIZE sizeof(struct ubi_ec_hdr) -#define UBI_VID_HDR_SIZE sizeof(struct ubi_vid_hdr) - -/* Sizes of UBI headers without the ending CRC */ -#define UBI_EC_HDR_SIZE_CRC (UBI_EC_HDR_SIZE - sizeof(__be32)) -#define UBI_VID_HDR_SIZE_CRC (UBI_VID_HDR_SIZE - sizeof(__be32)) - -/** - * struct ubi_ec_hdr - UBI erase counter header. - * @magic: erase counter header magic number (%UBI_EC_HDR_MAGIC) - * @version: version of UBI implementation which is supposed to accept this - * UBI image - * @padding1: reserved for future, zeroes - * @ec: the erase counter - * @vid_hdr_offset: where the VID header starts - * @data_offset: where the user data start - * @padding2: reserved for future, zeroes - * @hdr_crc: erase counter header CRC checksum - * - * The erase counter header takes 64 bytes and has a plenty of unused space for - * future usage. The unused fields are zeroed. The @version field is used to - * indicate the version of UBI implementation which is supposed to be able to - * work with this UBI image. If @version is greater then the current UBI - * version, the image is rejected. This may be useful in future if something - * is changed radically. This field is duplicated in the volume identifier - * header. - * - * The @vid_hdr_offset and @data_offset fields contain the offset of the the - * volume identifier header and user data, relative to the beginning of the - * physical eraseblock. These values have to be the same for all physical - * eraseblocks. 
- */ -struct ubi_ec_hdr { - __be32 magic; - __u8 version; - __u8 padding1[3]; - __be64 ec; /* Warning: the current limit is 31-bit anyway! */ - __be32 vid_hdr_offset; - __be32 data_offset; - __u8 padding2[36]; - __be32 hdr_crc; -} __attribute__ ((packed)); - -/** - * struct ubi_vid_hdr - on-flash UBI volume identifier header. - * @magic: volume identifier header magic number (%UBI_VID_HDR_MAGIC) - * @version: UBI implementation version which is supposed to accept this UBI - * image (%UBI_VERSION) - * @vol_type: volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC) - * @copy_flag: if this logical eraseblock was copied from another physical - * eraseblock (for wear-leveling reasons) - * @compat: compatibility of this volume (%0, %UBI_COMPAT_DELETE, - * %UBI_COMPAT_IGNORE, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT) - * @vol_id: ID of this volume - * @lnum: logical eraseblock number - * @leb_ver: version of this logical eraseblock (IMPORTANT: obsolete, to be - * removed, kept only for not breaking older UBI users) - * @data_size: how many bytes of data this logical eraseblock contains - * @used_ebs: total number of used logical eraseblocks in this volume - * @data_pad: how many bytes at the end of this physical eraseblock are not - * used - * @data_crc: CRC checksum of the data stored in this logical eraseblock - * @padding1: reserved for future, zeroes - * @sqnum: sequence number - * @padding2: reserved for future, zeroes - * @hdr_crc: volume identifier header CRC checksum - * - * The @sqnum is the value of the global sequence counter at the time when this - * VID header was created. The global sequence counter is incremented each time - * UBI writes a new VID header to the flash, i.e. when it maps a logical - * eraseblock to a new physical eraseblock. The global sequence counter is an - * unsigned 64-bit integer and we assume it never overflows. The @sqnum - * (sequence number) is used to distinguish between older and newer versions of - * logical eraseblocks. 
- * - * There are 2 situations when there may be more then one physical eraseblock - * corresponding to the same logical eraseblock, i.e., having the same @vol_id - * and @lnum values in the volume identifier header. Suppose we have a logical - * eraseblock L and it is mapped to the physical eraseblock P. - * - * 1. Because UBI may erase physical eraseblocks asynchronously, the following - * situation is possible: L is asynchronously erased, so P is scheduled for - * erasure, then L is written to,i.e. mapped to another physical eraseblock P1, - * so P1 is written to, then an unclean reboot happens. Result - there are 2 - * physical eraseblocks P and P1 corresponding to the same logical eraseblock - * L. But P1 has greater sequence number, so UBI picks P1 when it attaches the - * flash. - * - * 2. From time to time UBI moves logical eraseblocks to other physical - * eraseblocks for wear-leveling reasons. If, for example, UBI moves L from P - * to P1, and an unclean reboot happens before P is physically erased, there - * are two physical eraseblocks P and P1 corresponding to L and UBI has to - * select one of them when the flash is attached. The @sqnum field says which - * PEB is the original (obviously P will have lower @sqnum) and the copy. But - * it is not enough to select the physical eraseblock with the higher sequence - * number, because the unclean reboot could have happen in the middle of the - * copying process, so the data in P is corrupted. It is also not enough to - * just select the physical eraseblock with lower sequence number, because the - * data there may be old (consider a case if more data was added to P1 after - * the copying). Moreover, the unclean reboot may happen when the erasure of P - * was just started, so it result in unstable P, which is "mostly" OK, but - * still has unstable bits. - * - * UBI uses the @copy_flag field to indicate that this logical eraseblock is a - * copy. 
UBI also calculates data CRC when the data is moved and stores it at - * the @data_crc field of the copy (P1). So when UBI needs to pick one physical - * eraseblock of two (P or P1), the @copy_flag of the newer one (P1) is - * examined. If it is cleared, the situation* is simple and the newer one is - * picked. If it is set, the data CRC of the copy (P1) is examined. If the CRC - * checksum is correct, this physical eraseblock is selected (P1). Otherwise - * the older one (P) is selected. - * - * Note, there is an obsolete @leb_ver field which was used instead of @sqnum - * in the past. But it is not used anymore and we keep it in order to be able - * to deal with old UBI images. It will be removed at some point. - * - * There are 2 sorts of volumes in UBI: user volumes and internal volumes. - * Internal volumes are not seen from outside and are used for various internal - * UBI purposes. In this implementation there is only one internal volume - the - * layout volume. Internal volumes are the main mechanism of UBI extensions. - * For example, in future one may introduce a journal internal volume. Internal - * volumes have their own reserved range of IDs. - * - * The @compat field is only used for internal volumes and contains the "degree - * of their compatibility". It is always zero for user volumes. This field - * provides a mechanism to introduce UBI extensions and to be still compatible - * with older UBI binaries. For example, if someone introduced a journal in - * future, he would probably use %UBI_COMPAT_DELETE compatibility for the - * journal volume. And in this case, older UBI binaries, which know nothing - * about the journal volume, would just delete this volume and work perfectly - * fine. This is similar to what Ext2fs does when it is fed by an Ext3fs image - * - it just ignores the Ext3fs journal. - * - * The @data_crc field contains the CRC checksum of the contents of the logical - * eraseblock if this is a static volume. 
In case of dynamic volumes, it does - * not contain the CRC checksum as a rule. The only exception is when the - * data of the physical eraseblock was moved by the wear-leveling unit, then - * the wear-leveling unit calculates the data CRC and stores it in the - * @data_crc field. And of course, the @copy_flag is %in this case. - * - * The @data_size field is used only for static volumes because UBI has to know - * how many bytes of data are stored in this eraseblock. For dynamic volumes, - * this field usually contains zero. The only exception is when the data of the - * physical eraseblock was moved to another physical eraseblock for - * wear-leveling reasons. In this case, UBI calculates CRC checksum of the - * contents and uses both @data_crc and @data_size fields. In this case, the - * @data_size field contains data size. - * - * The @used_ebs field is used only for static volumes and indicates how many - * eraseblocks the data of the volume takes. For dynamic volumes this field is - * not used and always contains zero. - * - * The @data_pad is calculated when volumes are created using the alignment - * parameter. So, effectively, the @data_pad field reduces the size of logical - * eraseblocks of this volume. This is very handy when one uses block-oriented - * software (say, cramfs) on top of the UBI volume. - */ -struct ubi_vid_hdr { - __be32 magic; - __u8 version; - __u8 vol_type; - __u8 copy_flag; - __u8 compat; - __be32 vol_id; - __be32 lnum; - __be32 leb_ver; /* obsolete, to be removed, don't use */ - __be32 data_size; - __be32 used_ebs; - __be32 data_pad; - __be32 data_crc; - __u8 padding1[4]; - __be64 sqnum; - __u8 padding2[12]; - __be32 hdr_crc; -} __attribute__ ((packed)); - -/* Internal UBI volumes count */ -#define UBI_INT_VOL_COUNT 1 - -/* - * Starting ID of internal volumes. There is reserved room for 4096 internal - * volumes. 
- */ -#define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096) - -/* The layout volume contains the volume table */ - -#define UBI_LAYOUT_VOL_ID UBI_INTERNAL_VOL_START -#define UBI_LAYOUT_VOLUME_EBS 2 -#define UBI_LAYOUT_VOLUME_NAME "layout volume" -#define UBI_LAYOUT_VOLUME_COMPAT UBI_COMPAT_REJECT - -/* The maximum number of volumes per one UBI device */ -#define UBI_MAX_VOLUMES 128 - -/* The maximum volume name length */ -#define UBI_VOL_NAME_MAX 127 - -/* Size of the volume table record */ -#define UBI_VTBL_RECORD_SIZE sizeof(struct ubi_vtbl_record) - -/* Size of the volume table record without the ending CRC */ -#define UBI_VTBL_RECORD_SIZE_CRC (UBI_VTBL_RECORD_SIZE - sizeof(__be32)) - -/** - * struct ubi_vtbl_record - a record in the volume table. - * @reserved_pebs: how many physical eraseblocks are reserved for this volume - * @alignment: volume alignment - * @data_pad: how many bytes are unused at the end of the each physical - * eraseblock to satisfy the requested alignment - * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) - * @upd_marker: if volume update was started but not finished - * @name_len: volume name length - * @name: the volume name - * @padding2: reserved, zeroes - * @crc: a CRC32 checksum of the record - * - * The volume table records are stored in the volume table, which is stored in - * the layout volume. The layout volume consists of 2 logical eraseblock, each - * of which contains a copy of the volume table (i.e., the volume table is - * duplicated). The volume table is an array of &struct ubi_vtbl_record - * objects indexed by the volume ID. - * - * If the size of the logical eraseblock is large enough to fit - * %UBI_MAX_VOLUMES records, the volume table contains %UBI_MAX_VOLUMES - * records. Otherwise, it contains as many records as it can fit (i.e., size of - * logical eraseblock divided by sizeof(struct ubi_vtbl_record)). - * - * The @upd_marker flag is used to implement volume update. 
It is set to %1 - * before update and set to %0 after the update. So if the update operation was - * interrupted, UBI knows that the volume is corrupted. - * - * The @alignment field is specified when the volume is created and cannot be - * later changed. It may be useful, for example, when a block-oriented file - * system works on top of UBI. The @data_pad field is calculated using the - * logical eraseblock size and @alignment. The alignment must be multiple to the - * minimal flash I/O unit. If @alignment is 1, all the available space of - * the physical eraseblocks is used. - * - * Empty records contain all zeroes and the CRC checksum of those zeroes. - */ -struct ubi_vtbl_record { - __be32 reserved_pebs; - __be32 alignment; - __be32 data_pad; - __u8 vol_type; - __u8 upd_marker; - __be16 name_len; - __u8 name[UBI_VOL_NAME_MAX+1]; - __u8 padding2[24]; - __be32 crc; -} __attribute__ ((packed)); - -#endif /* !__UBI_HEADER_H__ */ diff -Nurd linux-2.6.24/include/mtd/ubi-user.h ubifs-v2.6.24/include/mtd/ubi-user.h --- linux-2.6.24/include/mtd/ubi-user.h 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/include/mtd/ubi-user.h 2009-04-07 17:14:47.000000000 +0200 @@ -22,33 +22,55 @@ #define __UBI_USER_H__ /* + * UBI device creation (the same as MTD device attachment) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * MTD devices may be attached using %UBI_IOCATT ioctl command of the UBI + * control device. The caller has to properly fill and pass + * &struct ubi_attach_req object - UBI will attach the MTD device specified in + * the request and return the newly created UBI device number as the ioctl + * return value. + * + * UBI device deletion (the same as MTD device detachment) + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * An UBI device may be deleted with %UBI_IOCDET ioctl command of the UBI + * control device.
+ * * UBI volume creation * ~~~~~~~~~~~~~~~~~~~ * - * UBI volumes are created via the %UBI_IOCMKVOL IOCTL command of UBI character + * UBI volumes are created via the %UBI_IOCMKVOL ioctl command of UBI character * device. A &struct ubi_mkvol_req object has to be properly filled and a - * pointer to it has to be passed to the IOCTL. + * pointer to it has to be passed to the ioctl. * * UBI volume deletion * ~~~~~~~~~~~~~~~~~~~ * - * To delete a volume, the %UBI_IOCRMVOL IOCTL command of the UBI character + * To delete a volume, the %UBI_IOCRMVOL ioctl command of the UBI character * device should be used. A pointer to the 32-bit volume ID hast to be passed - * to the IOCTL. + * to the ioctl. * * UBI volume re-size * ~~~~~~~~~~~~~~~~~~ * - * To re-size a volume, the %UBI_IOCRSVOL IOCTL command of the UBI character + * To re-size a volume, the %UBI_IOCRSVOL ioctl command of the UBI character * device should be used. A &struct ubi_rsvol_req object has to be properly - * filled and a pointer to it has to be passed to the IOCTL. + * filled and a pointer to it has to be passed to the ioctl. + * + * UBI volumes re-name + * ~~~~~~~~~~~~~~~~~~~ + * + * To re-name several volumes atomically at one go, the %UBI_IOCRNVOL command + * of the UBI character device should be used. A &struct ubi_rnvol_req object + * has to be properly filled and a pointer to it has to be passed to the ioctl. * * UBI volume update * ~~~~~~~~~~~~~~~~~ * - * Volume update should be done via the %UBI_IOCVOLUP IOCTL command of the + * Volume update should be done via the %UBI_IOCVOLUP ioctl command of the * corresponding UBI volume character device. A pointer to a 64-bit update - * size should be passed to the IOCTL. After then, UBI expects user to write + * size should be passed to the ioctl. After this, UBI expects user to write * this number of bytes to the volume character device. The update is finished * when the claimed number of bytes is passed. 
So, the volume update sequence * is something like: @@ -57,19 +79,73 @@ * ioctl(fd, UBI_IOCVOLUP, &image_size); * write(fd, buf, image_size); * close(fd); + * + * Logical eraseblock erase + * ~~~~~~~~~~~~~~~~~~~~~~~~ + * + * To erase a logical eraseblock, the %UBI_IOCEBER ioctl command of the + * corresponding UBI volume character device should be used. This command + * unmaps the requested logical eraseblock, makes sure the corresponding + * physical eraseblock is successfully erased, and returns. + * + * Atomic logical eraseblock change + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Atomic logical eraseblock change operation is called using the %UBI_IOCEBCH + * ioctl command of the corresponding UBI volume character device. A pointer to + * a &struct ubi_leb_change_req object has to be passed to the ioctl. Then the + * user is expected to write the requested amount of bytes (similarly to what + * should be done in case of the "volume update" ioctl). + * + * Logical eraseblock map + * ~~~~~~~~~~~~~~~~~~~~~ + * + * To map a logical eraseblock to a physical eraseblock, the %UBI_IOCEBMAP + * ioctl command should be used. A pointer to a &struct ubi_map_req object is + * expected to be passed. The ioctl maps the requested logical eraseblock to + * a physical eraseblock and returns. Only non-mapped logical eraseblocks can + * be mapped. If the logical eraseblock specified in the request is already + * mapped to a physical eraseblock, the ioctl fails and returns error. + * + * Logical eraseblock unmap + * ~~~~~~~~~~~~~~~~~~~~~~~~ + * + * To unmap a logical eraseblock from a physical eraseblock, the %UBI_IOCEBUNMAP + * ioctl command should be used. The ioctl unmaps the logical eraseblock, + * schedules corresponding physical eraseblock for erasure, and returns. Unlike + * the "LEB erase" command, it does not wait for the physical eraseblock being + * erased.
Note, the side effect of this is that if an unclean reboot happens + * after the unmap ioctl returns, you may find the LEB mapped again to the same + * physical eraseblock after the UBI is run again. + * + * Check if logical eraseblock is mapped + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * To check if a logical eraseblock is mapped to a physical eraseblock, the + * %UBI_IOCEBISMAP ioctl command should be used. It returns %0 if the LEB is + * not mapped, and %1 if it is mapped. + * + * Set an UBI volume property + * ~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * To set an UBI volume property the %UBI_IOCSETPROP ioctl command should be + * used. A pointer to a &struct ubi_set_prop_req object is expected to be + * passed. The object describes which property should be set, and to which value + * it should be set. */ /* - * When a new volume is created, users may either specify the volume number they - * want to create or to let UBI automatically assign a volume number using this - * constant. + * When a new UBI volume or UBI device is created, users may either specify the + * volume/device number they want to create or to let UBI automatically assign + * the number using these constants. 
*/ #define UBI_VOL_NUM_AUTO (-1) +#define UBI_DEV_NUM_AUTO (-1) /* Maximum volume name length */ #define UBI_MAX_VOLUME_NAME 127 -/* IOCTL commands of UBI character devices */ +/* ioctl commands of UBI character devices */ #define UBI_IOC_MAGIC 'o' @@ -79,15 +155,59 @@ #define UBI_IOCRMVOL _IOW(UBI_IOC_MAGIC, 1, int32_t) /* Re-size an UBI volume */ #define UBI_IOCRSVOL _IOW(UBI_IOC_MAGIC, 2, struct ubi_rsvol_req) +/* Re-name volumes */ +#define UBI_IOCRNVOL _IOW(UBI_IOC_MAGIC, 3, struct ubi_rnvol_req) -/* IOCTL commands of UBI volume character devices */ +/* ioctl commands of the UBI control character device */ + +#define UBI_CTRL_IOC_MAGIC 'o' + +/* Attach an MTD device */ +#define UBI_IOCATT _IOW(UBI_CTRL_IOC_MAGIC, 64, struct ubi_attach_req) +/* Detach an MTD device */ +#define UBI_IOCDET _IOW(UBI_CTRL_IOC_MAGIC, 65, int32_t) + +/* ioctl commands of UBI volume character devices */ #define UBI_VOL_IOC_MAGIC 'O' /* Start UBI volume update */ #define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, int64_t) -/* An eraseblock erasure command, used for debugging, disabled by default */ +/* LEB erasure command, used for debugging, disabled by default */ #define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, int32_t) +/* Atomic LEB change command */ +#define UBI_IOCEBCH _IOW(UBI_VOL_IOC_MAGIC, 2, int32_t) +/* Map LEB command */ +#define UBI_IOCEBMAP _IOW(UBI_VOL_IOC_MAGIC, 3, struct ubi_map_req) +/* Unmap LEB command */ +#define UBI_IOCEBUNMAP _IOW(UBI_VOL_IOC_MAGIC, 4, int32_t) +/* Check if LEB is mapped command */ +#define UBI_IOCEBISMAP _IOR(UBI_VOL_IOC_MAGIC, 5, int32_t) +/* Set an UBI volume property */ +#define UBI_IOCSETPROP _IOW(UBI_VOL_IOC_MAGIC, 6, struct ubi_set_prop_req) + +/* Maximum MTD device name length supported by UBI */ +#define MAX_UBI_MTD_NAME_LEN 127 + +/* Maximum amount of UBI volumes that can be re-named at one go */ +#define UBI_MAX_RNVOL 32 + +/* + * UBI data type hint constants. 
+ * + * UBI_LONGTERM: long-term data + * UBI_SHORTTERM: short-term data + * UBI_UNKNOWN: data persistence is unknown + * + * These constants are used when data is written to UBI volumes in order to + * help the UBI wear-leveling unit to find more appropriate physical + * eraseblocks. + */ +enum { + UBI_LONGTERM = 1, + UBI_SHORTTERM = 2, + UBI_UNKNOWN = 3, +}; /* * UBI volume type constants. @@ -97,22 +217,68 @@ */ enum { UBI_DYNAMIC_VOLUME = 3, - UBI_STATIC_VOLUME = 4 + UBI_STATIC_VOLUME = 4, +}; + +/* + * UBI set property ioctl constants + * + * @UBI_PROP_DIRECT_WRITE: allow / disallow user to directly write and + * erase individual eraseblocks on dynamic volumes + */ +enum { + UBI_PROP_DIRECT_WRITE = 1, +}; + +/** + * struct ubi_attach_req - attach MTD device request. + * @ubi_num: UBI device number to create + * @mtd_num: MTD device number to attach + * @vid_hdr_offset: VID header offset (use defaults if %0) + * @padding: reserved for future, not used, has to be zeroed + * + * This data structure is used to specify MTD device UBI has to attach and the + * parameters it has to use. The number which should be assigned to the new UBI + * device is passed in @ubi_num. UBI may automatically assign the number if + * @UBI_DEV_NUM_AUTO is passed. In this case, the device number is returned in + * @ubi_num. + * + * Most applications should pass %0 in @vid_hdr_offset to make UBI use default + * offset of the VID header within physical eraseblocks. The default offset is + * the next min. I/O unit after the EC header. For example, it will be offset + * 512 in case of a 512 bytes page NAND flash with no sub-page support. Or + * it will be 512 in case of a 2KiB page NAND flash with 4 512-byte sub-pages. + * + * But in rare cases, if this optimizes things, the VID header may be placed to + * a different offset. For example, the boot-loader might do things faster if + * the VID header sits at the end of the first 2KiB NAND page with 4 sub-pages. 
+ * As the boot-loader would not normally need to read EC headers (unless it + * needs UBI in RW mode), it might be faster to calculate ECC. This is a weird + * example, but it is a real-life example. So, in this example, @vid_hdr_offset would + * be 2KiB-64 bytes = 1984. Note, that this position is not even 512-bytes + * aligned, which is OK, as UBI is clever enough to realize this is 4th + * sub-page of the first page and add needed padding. + */ +struct ubi_attach_req { + int32_t ubi_num; + int32_t mtd_num; + int32_t vid_hdr_offset; + int8_t padding[12]; }; /** * struct ubi_mkvol_req - volume description data structure used in - * volume creation requests. + * volume creation requests. * @vol_id: volume number * @alignment: volume alignment * @bytes: volume size in bytes * @vol_type: volume type (%UBI_DYNAMIC_VOLUME or %UBI_STATIC_VOLUME) - * @padding1: reserved for future, not used + * @padding1: reserved for future, not used, has to be zeroed * @name_len: volume name length - * @padding2: reserved for future, not used + * @padding2: reserved for future, not used, has to be zeroed * @name: volume name * - * This structure is used by userspace programs when creating new volumes. The + * This structure is used by user-space programs when creating new volumes. The * @used_bytes field is only necessary when creating static volumes. * * The @alignment field specifies the required alignment of the volume logical @@ -139,7 +305,7 @@ int8_t padding1; int16_t name_len; int8_t padding2[4]; - char name[UBI_MAX_VOLUME_NAME+1]; + char name[UBI_MAX_VOLUME_NAME + 1]; } __attribute__ ((packed)); /** @@ -158,4 +324,87 @@ int32_t vol_id; } __attribute__ ((packed)); +/** + * struct ubi_rnvol_req - volumes re-name request.
+ * @count: count of volumes to re-name + * @padding1: reserved for future, not used, has to be zeroed + * @vol_id: ID of the volume to re-name + * @name_len: name length + * @padding2: reserved for future, not used, has to be zeroed + * @name: new volume name + * + * UBI allows to re-name up to %32 volumes at one go. The count of volumes to + * re-name is specified in the @count field. The ID of the volumes to re-name + * and the new names are specified in the @vol_id and @name fields. + * + * The UBI volume re-name operation is atomic, which means that should power cut + * happen, the volumes will have either old name or new name. So one possible + * use-case of this command is atomic upgrade. Indeed, to upgrade, say, volumes + * A and B one may create temporary volumes %A1 and %B1 with the new contents, + * then atomically re-name A1->A and B1->B, in which case old %A and %B will + * be removed. + * + * If it is not desirable to remove old A and B, the re-name request has to + * contain 4 entries: A1->A, A->A1, B1->B, B->B1, in which case old A1 and B1 + * become A and B, and old A and B will become A1 and B1. + * + * It is also OK to request: A1->A, A->X, B1->B, B->Y, in which case old A1 + * and B1 become A and B, and old A and B become X and Y. + * + * In other words, in case of re-naming into an existing volume name, the + * existing volume is removed, unless it is re-named as well at the same + * re-name request. + */ +struct ubi_rnvol_req { + int32_t count; + int8_t padding1[12]; + struct { + int32_t vol_id; + int16_t name_len; + int8_t padding2[2]; + char name[UBI_MAX_VOLUME_NAME + 1]; + } ents[UBI_MAX_RNVOL]; +} __attribute__ ((packed)); + +/** + * struct ubi_leb_change_req - a data structure used in atomic LEB change + * requests.
+ * @lnum: logical eraseblock number to change + * @bytes: how many bytes will be written to the logical eraseblock + * @dtype: data type (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * @padding: reserved for future, not used, has to be zeroed + */ +struct ubi_leb_change_req { + int32_t lnum; + int32_t bytes; + int8_t dtype; + int8_t padding[7]; +} __attribute__ ((packed)); + +/** + * struct ubi_map_req - a data structure used in map LEB requests. + * @lnum: logical eraseblock number to map + * @dtype: data type (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN) + * @padding: reserved for future, not used, has to be zeroed + */ +struct ubi_map_req { + int32_t lnum; + int8_t dtype; + int8_t padding[3]; +} __attribute__ ((packed)); + + +/** + * struct ubi_set_prop_req - a data structure used to set an ubi volume + * property. + * @property: property to set (%UBI_PROP_DIRECT_WRITE) + * @padding: reserved for future, not used, has to be zeroed + * @value: value to set + */ +struct ubi_set_prop_req { + uint8_t property; + uint8_t padding[7]; + uint64_t value; +} __attribute__ ((packed)); + #endif /* __UBI_USER_H__ */ diff -Nurd linux-2.6.24/init/do_mounts.c ubifs-v2.6.24/init/do_mounts.c --- linux-2.6.24/init/do_mounts.c 2008-01-25 00:58:37.000000000 +0200 +++ ubifs-v2.6.24/init/do_mounts.c 2009-04-07 17:14:47.000000000 +0200 @@ -440,7 +440,8 @@ if (saved_root_name[0]) { root_device_name = saved_root_name; - if (!strncmp(root_device_name, "mtd", 3)) { + if (!strncmp(root_device_name, "mtd", 3) || + !strncmp(root_device_name, "ubi", 3)) { mount_block_root(root_device_name, root_mountflags); goto out; }