diff -u --recursive --new-file v2.4.0/linux/Documentation/Changes linux/Documentation/Changes --- v2.4.0/linux/Documentation/Changes Mon Jan 1 10:00:04 2001 +++ linux/Documentation/Changes Mon Jan 8 15:18:32 2001 @@ -56,7 +56,7 @@ o e2fsprogs 1.19 # tune2fs --version o pcmcia-cs 3.1.21 # cardmgr -V o PPP 2.4.0 # pppd --version -o isdn4k-utils 3.1beta7 # isdnctrl 2>&1|grep version +o isdn4k-utils 3.1pre1 # isdnctrl 2>&1|grep version Kernel compilation ================== diff -u --recursive --new-file v2.4.0/linux/Documentation/Configure.help linux/Documentation/Configure.help --- v2.4.0/linux/Documentation/Configure.help Thu Jan 4 13:00:55 2001 +++ linux/Documentation/Configure.help Mon Jan 15 12:42:32 2001 @@ -10745,6 +10745,46 @@ called minix.o. Note that the file system of your root partition (the one containing the directory /) cannot be compiled as a module. +Reiserfs support +CONFIG_REISERFS_FS + + Stores not just filenames but the files themselves in a balanced + tree. Uses journaling. + + Balanced trees are more efficient than traditional + filesystem architectural foundations. + + You can use reiserfs in all cases where you use the ext2fs file + system, and you will gain in speed and disk space. It has fewer + worst case performance situations than other file systems + because balanced trees are hardier creatures than other algorithms + are (if that is not technical enough, read www.namesys.com....:-) ) + + It is more easily extended to have features currently found in + database and keyword search systems than block allocation based + filesystems are. The next version will be so extended, and will + support plugins consistent with our motto ``It takes more than a + license to make source code open.'' + + Read www.namesys.com to learn more about reiserfs. + + Sponsored by Threshold Networks, Emusic.com, and Bigstorage.com. + + If you like it, you can pay us to add new features to it that you + need, buy a support contract, or pay us to port it to another OS. + +Enable Reiserfs consistency checks +CONFIG_REISERFS_CHECK + If you set this to yes, then ReiserFS will perform every check it + can possibly imagine of its internal consistency throughout its + operation. It will also go substantially slower. More than once we + have forgotten that this was on, and then gone despondent over the + latest benchmarks.:-) Use of this option allows our team to go all + out in checking for consistency when debugging without fear of its + effect on end users. If you are on the verge of sending in a bug + report, say yes and you might get a useful error message. Almost + everyone should say no. + Second extended fs support CONFIG_EXT2_FS This is the de facto standard Linux file system (method to organize @@ -14618,6 +14658,14 @@ CONFIG_ISDN_PPP_VJ This enables Van Jacobson header compression for synchronous PPP. Say Y if the other end of the connection supports it. + +CONFIG_ISDN_PPP_BSDCOMP + Support for the BSD-Compress compression method for PPP, which uses + the LZW compression method to compress each PPP packet before it is + sent over the wire. The machine at the other end of the PPP link + (usually your ISP) has to support the BSD-Compress compression + method as well for this to be useful. Even if they don't support it, + it is safe to say Y here. 
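The Reiserfs help text above says the file system can be used anywhere ext2 is used. Below is a minimal user-space sketch of doing exactly that through the mount(2) system call; it is not part of this patch, it assumes a kernel built with CONFIG_REISERFS_FS (or the reiserfs module loaded), and the device and mount-point names (/dev/hda3, /mnt/data) are placeholders.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* /dev/hda3 must already hold a reiserfs file system and
	 * /mnt/data must exist; both names are hypothetical. */
	if (mount("/dev/hda3", "/mnt/data", "reiserfs", 0, NULL) != 0) {
		perror("mount reiserfs");
		return 1;
	}
	printf("reiserfs volume mounted\n");
	return 0;
}

Run as root; mounting a file system type the running kernel does not support fails with ENODEV.
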
Support audio via ISDN CONFIG_ISDN_AUDIO diff -u --recursive --new-file v2.4.0/linux/Makefile linux/Makefile --- v2.4.0/linux/Makefile Thu Jan 4 13:48:13 2001 +++ linux/Makefile Mon Jan 15 17:23:48 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 -SUBLEVEL = 0 -EXTRAVERSION = +SUBLEVEL = 1 +EXTRAVERSION =-pre7 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) @@ -457,9 +457,8 @@ depend dep: dep-files -# make checkconfig: Prune 'scripts' directory to avoid "false positives". checkconfig: - find * -name '*.[hcS]' -type f -print | grep -v scripts/ | sort | xargs $(PERL) -w scripts/checkconfig.pl + find * -name '*.[hcS]' -type f -print | sort | xargs $(PERL) -w scripts/checkconfig.pl checkhelp: find * -name [cC]onfig.in -print | sort | xargs $(PERL) -w scripts/checkhelp.pl diff -u --recursive --new-file v2.4.0/linux/arch/i386/Makefile linux/arch/i386/Makefile --- v2.4.0/linux/arch/i386/Makefile Fri Dec 29 14:07:19 2000 +++ linux/arch/i386/Makefile Wed Jan 10 15:06:14 2001 @@ -50,7 +50,7 @@ CFLAGS += -march=i686 endif -ifdef CONFIG_M686FXSR +ifdef CONFIG_MPENTIUMIII CFLAGS += -march=i686 endif diff -u --recursive --new-file v2.4.0/linux/arch/i386/config.in linux/arch/i386/config.in --- v2.4.0/linux/arch/i386/config.in Fri Dec 29 14:35:47 2000 +++ linux/arch/i386/config.in Mon Jan 8 13:27:56 2001 @@ -33,7 +33,7 @@ Pentium-Classic CONFIG_M586TSC \ Pentium-MMX CONFIG_M586MMX \ Pentium-Pro/Celeron/Pentium-II CONFIG_M686 \ - Pentium-III CONFIG_M686FXSR \ + Pentium-III CONFIG_MPENTIUMIII \ Pentium-4 CONFIG_MPENTIUM4 \ K6/K6-II/K6-III CONFIG_MK6 \ Athlon/K7 CONFIG_MK7 \ @@ -45,8 +45,6 @@ # Define implied options from the CPU selection here # -unset CONFIG_X86_FXSR - if [ "$CONFIG_M386" = "y" ]; then define_bool CONFIG_X86_CMPXCHG n define_int CONFIG_X86_L1_CACHE_SHIFT 4 @@ -87,14 +85,12 @@ define_bool CONFIG_X86_PGE y define_bool CONFIG_X86_USE_PPRO_CHECKSUM y fi -if [ "$CONFIG_M686FXSR" = "y" ]; then +if [ "$CONFIG_MPENTIUMIII" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 5 define_bool CONFIG_X86_TSC y define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_PGE y define_bool CONFIG_X86_USE_PPRO_CHECKSUM y - define_bool CONFIG_X86_FXSR y - define_bool CONFIG_X86_XMM y fi if [ "$CONFIG_MPENTIUM4" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 7 @@ -102,8 +98,6 @@ define_bool CONFIG_X86_GOOD_APIC y define_bool CONFIG_X86_PGE y define_bool CONFIG_X86_USE_PPRO_CHECKSUM y - define_bool CONFIG_X86_FXSR y - define_bool CONFIG_X86_XMM y fi if [ "$CONFIG_MK6" = "y" ]; then define_int CONFIG_X86_L1_CACHE_SHIFT 5 @@ -158,9 +152,7 @@ define_bool CONFIG_X86_PAE y fi -if [ "$CONFIG_X86_FXSR" != "y" ]; then - bool 'Math emulation' CONFIG_MATH_EMULATION -fi +bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP if [ "$CONFIG_SMP" != "y" ]; then diff -u --recursive --new-file v2.4.0/linux/arch/i386/defconfig linux/arch/i386/defconfig --- v2.4.0/linux/arch/i386/defconfig Sun Dec 31 09:17:18 2000 +++ linux/arch/i386/defconfig Mon Jan 15 12:49:47 2001 @@ -27,7 +27,7 @@ # CONFIG_M586TSC is not set # CONFIG_M586MMX is not set # CONFIG_M686 is not set -CONFIG_M686FXSR=y +CONFIG_MPENTIUMIII=y # CONFIG_MPENTIUM4 is not set # CONFIG_MK6 is not set # CONFIG_MK7 is not set @@ -45,8 +45,6 @@ CONFIG_X86_GOOD_APIC=y CONFIG_X86_PGE=y CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_FXSR=y -CONFIG_X86_XMM=y # CONFIG_TOSHIBA is not set # CONFIG_MICROCODE is not set # CONFIG_X86_MSR is not set @@ -54,6 +52,7 @@ 
CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +# CONFIG_MATH_EMULATION is not set # CONFIG_MTRR is not set CONFIG_SMP=y CONFIG_HAVE_DEC_LOCK=y @@ -538,6 +537,8 @@ # CONFIG_QUOTA is not set # CONFIG_AUTOFS_FS is not set CONFIG_AUTOFS4_FS=y +# CONFIG_REISERFS_FS is not set +# CONFIG_REISERFS_CHECK is not set # CONFIG_ADFS_FS is not set # CONFIG_ADFS_FS_RW is not set # CONFIG_AFFS_FS is not set diff -u --recursive --new-file v2.4.0/linux/arch/i386/kernel/i387.c linux/arch/i386/kernel/i387.c --- v2.4.0/linux/arch/i386/kernel/i387.c Fri Nov 3 09:47:48 2000 +++ linux/arch/i386/kernel/i387.c Thu Jan 11 17:12:18 2001 @@ -18,14 +18,6 @@ #include #include -#if defined(CONFIG_X86_FXSR) -#define HAVE_FXSR 1 -#elif defined(CONFIG_X86_RUNTIME_FXSR) -#define HAVE_FXSR (cpu_has_fxsr) -#else -#define HAVE_FXSR 0 -#endif - #ifdef CONFIG_MATH_EMULATION #define HAVE_HWFP (boot_cpu_data.hard_math) #else @@ -35,13 +27,13 @@ /* * The _current_ task is using the FPU for the first time * so initialize it and set the mxcsr to its default - * value at reset if we support FXSR and then + * value at reset if we support XMM instructions and then * remeber the current task has used the FPU. */ void init_fpu(void) { __asm__("fninit"); - if ( HAVE_FXSR ) + if ( cpu_has_xmm ) load_mxcsr(0x1f80); current->used_math = 1; @@ -51,9 +43,9 @@ * FPU lazy state save handling. */ -void save_init_fpu( struct task_struct *tsk ) +static inline void __save_init_fpu( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { asm volatile( "fxsave %0 ; fnclex" : "=m" (tsk->thread.i387.fxsave) ); } else { @@ -61,12 +53,28 @@ : "=m" (tsk->thread.i387.fsave) ); } tsk->flags &= ~PF_USEDFPU; +} + +void save_init_fpu( struct task_struct *tsk ) +{ + __save_init_fpu(tsk); stts(); } +void kernel_fpu_begin(void) +{ + struct task_struct *tsk = current; + + if (tsk->flags & PF_USEDFPU) { + __save_init_fpu(tsk); + return; + } + clts(); +} + void restore_fpu( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { asm volatile( "fxrstor %0" : : "m" (tsk->thread.i387.fxsave) ); } else { @@ -144,7 +152,7 @@ unsigned short get_fpu_cwd( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return tsk->thread.i387.fxsave.cwd; } else { return (unsigned short)tsk->thread.i387.fsave.cwd; @@ -153,7 +161,7 @@ unsigned short get_fpu_swd( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return tsk->thread.i387.fxsave.swd; } else { return (unsigned short)tsk->thread.i387.fsave.swd; @@ -162,7 +170,7 @@ unsigned short get_fpu_twd( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return tsk->thread.i387.fxsave.twd; } else { return (unsigned short)tsk->thread.i387.fsave.twd; @@ -171,7 +179,7 @@ unsigned short get_fpu_mxcsr( struct task_struct *tsk ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return tsk->thread.i387.fxsave.mxcsr; } else { return 0x1f80; @@ -180,7 +188,7 @@ void set_fpu_cwd( struct task_struct *tsk, unsigned short cwd ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { tsk->thread.i387.fxsave.cwd = cwd; } else { tsk->thread.i387.fsave.cwd = ((long)cwd | 0xffff0000); @@ -189,7 +197,7 @@ void set_fpu_swd( struct task_struct *tsk, unsigned short swd ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { tsk->thread.i387.fxsave.swd = swd; } else { tsk->thread.i387.fsave.swd = ((long)swd | 0xffff0000); @@ -198,7 +206,7 @@ void set_fpu_twd( struct task_struct *tsk, unsigned short twd ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { 
tsk->thread.i387.fxsave.twd = twd_i387_to_fxsr(twd); } else { tsk->thread.i387.fsave.twd = ((long)twd | 0xffff0000); @@ -207,7 +215,7 @@ void set_fpu_mxcsr( struct task_struct *tsk, unsigned short mxcsr ) { - if ( HAVE_FXSR ) { + if ( cpu_has_xmm ) { tsk->thread.i387.fxsave.mxcsr = mxcsr; } } @@ -321,7 +329,7 @@ current->used_math = 0; if ( HAVE_HWFP ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return save_i387_fxsave( buf ); } else { return save_i387_fsave( buf ); @@ -354,7 +362,7 @@ int err; if ( HAVE_HWFP ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { err = restore_i387_fxsave( buf ); } else { err = restore_i387_fsave( buf ); @@ -387,7 +395,7 @@ int get_fpregs( struct user_i387_struct *buf, struct task_struct *tsk ) { if ( HAVE_HWFP ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return get_fpregs_fxsave( buf, tsk ); } else { return get_fpregs_fsave( buf, tsk ); @@ -415,7 +423,7 @@ int set_fpregs( struct task_struct *tsk, struct user_i387_struct *buf ) { if ( HAVE_HWFP ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { return set_fpregs_fxsave( tsk, buf ); } else { return set_fpregs_fsave( tsk, buf ); @@ -428,9 +436,10 @@ int get_fpxregs( struct user_fxsr_struct *buf, struct task_struct *tsk ) { - if ( HAVE_FXSR ) { - __copy_to_user( (void *)buf, &tsk->thread.i387.fxsave, - sizeof(struct user_fxsr_struct) ); + if ( cpu_has_fxsr ) { + if (__copy_to_user( (void *)buf, &tsk->thread.i387.fxsave, + sizeof(struct user_fxsr_struct) )) + return -EFAULT; return 0; } else { return -EIO; @@ -439,7 +448,7 @@ int set_fpxregs( struct task_struct *tsk, struct user_fxsr_struct *buf ) { - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { __copy_from_user( &tsk->thread.i387.fxsave, (void *)buf, sizeof(struct user_fxsr_struct) ); /* mxcsr bit 6 and 31-16 must be zero for security reasons */ @@ -485,7 +494,7 @@ fpvalid = tsk->used_math; if ( fpvalid ) { unlazy_fpu( tsk ); - if ( HAVE_FXSR ) { + if ( cpu_has_fxsr ) { copy_fpu_fxsave( tsk, fpu ); } else { copy_fpu_fsave( tsk, fpu ); @@ -500,7 +509,7 @@ int fpvalid; struct task_struct *tsk = current; - fpvalid = tsk->used_math && HAVE_FXSR; + fpvalid = tsk->used_math && cpu_has_fxsr; if ( fpvalid ) { unlazy_fpu( tsk ); memcpy( fpu, &tsk->thread.i387.fxsave, diff -u --recursive --new-file v2.4.0/linux/arch/i386/kernel/setup.c linux/arch/i386/kernel/setup.c --- v2.4.0/linux/arch/i386/kernel/setup.c Sun Dec 31 10:26:18 2000 +++ linux/arch/i386/kernel/setup.c Mon Jan 15 12:39:32 2001 @@ -147,6 +147,7 @@ extern unsigned long cpu_khz; static int disable_x86_serial_nr __initdata = 1; +static int disable_x86_fxsr __initdata = 0; /* * This is set up by the setup-routine at boot-time @@ -518,7 +519,7 @@ e820.nr_map = 0; add_memory_region(0, LOWMEMSIZE(), E820_RAM); - add_memory_region(HIGH_MEMORY, (mem_size << 10) - HIGH_MEMORY, E820_RAM); + add_memory_region(HIGH_MEMORY, mem_size << 10, E820_RAM); } printk("BIOS-provided physical RAM map:\n"); print_memory_map(who); @@ -1796,6 +1797,13 @@ } __setup("serialnumber", x86_serial_nr_setup); +int __init x86_fxsr_setup(char * s) +{ + disable_x86_fxsr = 1; + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + /* Standard macro to see if a specific flag is changeable */ static inline int flag_is_changeable_p(u32 flag) @@ -2000,10 +2008,16 @@ */ /* TSC disabled? */ -#ifdef CONFIG_TSC +#ifndef CONFIG_X86_TSC if ( tsc_disable ) clear_bit(X86_FEATURE_TSC, &c->x86_capability); #endif + + /* FXSR disabled? 
*/ + if (disable_x86_fxsr) { + clear_bit(X86_FEATURE_FXSR, &c->x86_capability); + clear_bit(X86_FEATURE_XMM, &c->x86_capability); + } /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff -u --recursive --new-file v2.4.0/linux/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c --- v2.4.0/linux/arch/i386/kernel/traps.c Wed Jan 3 20:45:26 2001 +++ linux/arch/i386/kernel/traps.c Mon Jan 15 16:54:20 2001 @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_MCA #include diff -u --recursive --new-file v2.4.0/linux/arch/i386/lib/mmx.c linux/arch/i386/lib/mmx.c --- v2.4.0/linux/arch/i386/lib/mmx.c Wed Nov 8 17:09:49 2000 +++ linux/arch/i386/lib/mmx.c Thu Jan 11 17:42:24 2001 @@ -2,6 +2,8 @@ #include #include +#include + /* * MMX 3DNow! library helper functions * @@ -26,13 +28,7 @@ void *p=to; int i= len >> 6; /* len/64 */ - if (!(current->flags & PF_USEDFPU)) - clts(); - else - { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; - } + kernel_fpu_begin(); __asm__ __volatile__ ( "1: prefetch (%0)\n" /* This set is 28 bytes */ @@ -88,20 +84,15 @@ * Now do the tail of the block */ __memcpy(to, from, len&63); - stts(); + kernel_fpu_end(); return p; } static void fast_clear_page(void *page) { int i; - if (!(current->flags & PF_USEDFPU)) - clts(); - else - { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; - } + + kernel_fpu_begin(); __asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : @@ -127,19 +118,14 @@ __asm__ __volatile__ ( " sfence \n" : : ); - stts(); + kernel_fpu_end(); } static void fast_copy_page(void *to, void *from) { int i; - if (!(current->flags & PF_USEDFPU)) - clts(); - else - { - __asm__ __volatile__ ( " fnsave %0; fwait\n"::"m"(current->thread.i387)); - current->flags &= ~PF_USEDFPU; - } + + kernel_fpu_begin(); /* maybe the prefetch stuff can go before the expensive fnsave... * but that is for later. 
-AV @@ -199,7 +185,7 @@ __asm__ __volatile__ ( " sfence \n" : : ); - stts(); + kernel_fpu_end(); } /* diff -u --recursive --new-file v2.4.0/linux/arch/i386/mm/init.c linux/arch/i386/mm/init.c --- v2.4.0/linux/arch/i386/mm/init.c Tue Nov 28 22:43:39 2000 +++ linux/arch/i386/mm/init.c Mon Jan 15 11:06:55 2001 @@ -317,7 +317,7 @@ pgd_t *pgd, *pgd_base; int i, j, k; pmd_t *pmd; - pte_t *pte; + pte_t *pte, *pte_base; /* * This can be zero as well - no problem, in that case we exit @@ -366,11 +366,7 @@ continue; } - pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); - - if (pte != pte_offset(pmd, 0)) - BUG(); + pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); for (k = 0; k < PTRS_PER_PTE; pte++, k++) { vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; @@ -378,6 +374,10 @@ break; *pte = mk_pte_phys(__pa(vaddr), PAGE_KERNEL); } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + if (pte_base != pte_offset(pmd, 0)) + BUG(); + } } diff -u --recursive --new-file v2.4.0/linux/drivers/block/DAC960.c linux/drivers/block/DAC960.c --- v2.4.0/linux/drivers/block/DAC960.c Thu Dec 7 17:08:24 2000 +++ linux/drivers/block/DAC960.c Mon Jan 15 13:08:15 2001 @@ -1820,7 +1820,6 @@ Request->nr_segments < Controller->DriverScatterGatherLimit) { Request->nr_segments++; - RequestQueue->elevator.nr_segments++; return true; } return false; @@ -1844,7 +1843,6 @@ Request->nr_segments < Controller->DriverScatterGatherLimit) { Request->nr_segments++; - RequestQueue->elevator.nr_segments++; return true; } return false; @@ -1874,7 +1872,6 @@ if (TotalSegments > MaxSegments || TotalSegments > Controller->DriverScatterGatherLimit) return false; - RequestQueue->elevator.nr_segments -= SameSegment; Request->nr_segments = TotalSegments; return true; } diff -u --recursive --new-file v2.4.0/linux/drivers/block/elevator.c linux/drivers/block/elevator.c --- v2.4.0/linux/drivers/block/elevator.c Tue Dec 5 15:05:26 2000 +++ linux/drivers/block/elevator.c Mon Jan 15 13:08:15 2001 @@ -24,125 +24,115 @@ #include #include #include +#include #include -/* - * Order ascending, but only allow a request to be skipped a certain - * number of times - */ -void elevator_linus(struct request *req, elevator_t *elevator, - struct list_head *real_head, - struct list_head *head, int orig_latency) -{ - struct list_head *entry = real_head; - struct request *tmp; - - req->elevator_sequence = orig_latency; - - while ((entry = entry->prev) != head) { - tmp = blkdev_entry_to_request(entry); - if (IN_ORDER(tmp, req)) - break; - if (!tmp->elevator_sequence) - break; - tmp->elevator_sequence--; - } - list_add(&req->queue, entry); -} - int elevator_linus_merge(request_queue_t *q, struct request **req, + struct list_head * head, struct buffer_head *bh, int rw, - int *max_sectors, int *max_segments) + int max_sectors, int max_segments) { - struct list_head *entry, *head = &q->queue_head; + struct list_head *entry = &q->queue_head; unsigned int count = bh->b_size >> 9, ret = ELEVATOR_NO_MERGE; - entry = head; - if (q->head_active && !q->plugged) - head = head->next; - while ((entry = entry->prev) != head) { - struct request *__rq = *req = blkdev_entry_to_request(entry); + struct request *__rq = blkdev_entry_to_request(entry); + + /* + * simply "aging" of requests in queue + */ + if (__rq->elevator_sequence-- <= 0) { + *req = __rq; + break; + } + if (__rq->sem) continue; if (__rq->cmd != rw) continue; - if (__rq->nr_sectors + count > *max_sectors) - continue; if (__rq->rq_dev != bh->b_rdev) continue; 
+ if (__rq->nr_sectors + count > max_sectors) + continue; + if (__rq->elevator_sequence < count) + break; if (__rq->sector + __rq->nr_sectors == bh->b_rsector) { ret = ELEVATOR_BACK_MERGE; + *req = __rq; break; - } - if (!__rq->elevator_sequence) - break; - if (__rq->sector - count == bh->b_rsector) { - __rq->elevator_sequence--; + } else if (__rq->sector - count == bh->b_rsector) { ret = ELEVATOR_FRONT_MERGE; + __rq->elevator_sequence -= count; + *req = __rq; break; - } + } else if (!*req && BHRQ_IN_ORDER(bh, __rq)) + *req = __rq; } + return ret; +} + +void elevator_linus_merge_cleanup(request_queue_t *q, struct request *req, int count) +{ + struct list_head *entry = &req->queue, *head = &q->queue_head; + /* * second pass scan of requests that got passed over, if any */ - if (ret != ELEVATOR_NO_MERGE && *req) { - while ((entry = entry->next) != &q->queue_head) { - struct request *tmp = blkdev_entry_to_request(entry); - tmp->elevator_sequence--; - } + while ((entry = entry->next) != head) { + struct request *tmp = blkdev_entry_to_request(entry); + tmp->elevator_sequence -= count; } - - return ret; } -/* - * No request sorting, just add it to the back of the list - */ -void elevator_noop(struct request *req, elevator_t *elevator, - struct list_head *real_head, struct list_head *head, - int orig_latency) +void elevator_linus_merge_req(struct request *req, struct request *next) { - list_add_tail(&req->queue, real_head); + if (next->elevator_sequence < req->elevator_sequence) + req->elevator_sequence = next->elevator_sequence; } /* - * See if we can find a request that is buffer can be coalesced with. + * See if we can find a request that this buffer can be coalesced with. */ int elevator_noop_merge(request_queue_t *q, struct request **req, + struct list_head * head, struct buffer_head *bh, int rw, - int *max_sectors, int *max_segments) + int max_sectors, int max_segments) { - struct list_head *entry, *head = &q->queue_head; + struct list_head *entry; unsigned int count = bh->b_size >> 9; - if (q->head_active && !q->plugged) - head = head->next; + if (list_empty(&q->queue_head)) + return ELEVATOR_NO_MERGE; - entry = head; + entry = &q->queue_head; while ((entry = entry->prev) != head) { - struct request *__rq = *req = blkdev_entry_to_request(entry); - if (__rq->sem) - continue; + struct request *__rq = blkdev_entry_to_request(entry); + if (__rq->cmd != rw) continue; - if (__rq->nr_sectors + count > *max_sectors) - continue; if (__rq->rq_dev != bh->b_rdev) continue; - if (__rq->sector + __rq->nr_sectors == bh->b_rsector) + if (__rq->nr_sectors + count > max_sectors) + continue; + if (__rq->sem) + continue; + if (__rq->sector + __rq->nr_sectors == bh->b_rsector) { + *req = __rq; return ELEVATOR_BACK_MERGE; - if (__rq->sector - count == bh->b_rsector) + } else if (__rq->sector - count == bh->b_rsector) { + *req = __rq; return ELEVATOR_FRONT_MERGE; + } } + + *req = blkdev_entry_to_request(q->queue_head.prev); return ELEVATOR_NO_MERGE; } -/* - * The noop "elevator" does not do any accounting - */ -void elevator_noop_dequeue(struct request *req) {} +void elevator_noop_merge_cleanup(request_queue_t *q, struct request *req, int count) {} + +void elevator_noop_merge_req(struct request *req, struct request *next) {} int blkelvget_ioctl(elevator_t * elevator, blkelv_ioctl_arg_t * arg) { diff -u --recursive --new-file v2.4.0/linux/drivers/block/ll_rw_blk.c linux/drivers/block/ll_rw_blk.c --- v2.4.0/linux/drivers/block/ll_rw_blk.c Sun Dec 31 11:16:58 2000 +++ linux/drivers/block/ll_rw_blk.c Mon Jan 15 
16:52:57 2001 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,19 @@ */ int * max_sectors[MAX_BLKDEV]; +/* + * queued sectors for all devices, used to make sure we don't fill all + * of memory with locked buffers + */ +atomic_t queued_sectors; + +/* + * high and low watermark for above + */ +static int high_queued_sectors, low_queued_sectors; +static int batch_requests, queue_nr_requests; +static DECLARE_WAIT_QUEUE_HEAD(blk_buffers_wait); + static inline int get_max_sectors(kdev_t dev) { if (!max_sectors[MAJOR(dev)]) @@ -125,7 +139,7 @@ return max_sectors[MAJOR(dev)][MINOR(dev)]; } -static inline request_queue_t *__blk_get_queue(kdev_t dev) +inline request_queue_t *__blk_get_queue(kdev_t dev) { struct blk_dev_struct *bdev = blk_dev + MAJOR(dev); @@ -153,17 +167,14 @@ static int __blk_cleanup_queue(struct list_head *head) { - struct list_head *entry; struct request *rq; int i = 0; if (list_empty(head)) return 0; - entry = head->next; do { - rq = list_entry(entry, struct request, table); - entry = entry->next; + rq = list_entry(head->next, struct request, table); list_del(&rq->table); kmem_cache_free(request_cachep, rq); i++; @@ -188,10 +199,12 @@ **/ void blk_cleanup_queue(request_queue_t * q) { - int count = QUEUE_NR_REQUESTS; + int count = queue_nr_requests; count -= __blk_cleanup_queue(&q->request_freelist[READ]); count -= __blk_cleanup_queue(&q->request_freelist[WRITE]); + count -= __blk_cleanup_queue(&q->pending_freelist[READ]); + count -= __blk_cleanup_queue(&q->pending_freelist[WRITE]); if (count) printk("blk_cleanup_queue: leaked requests (%d)\n", count); @@ -290,7 +303,6 @@ { if (req->nr_segments < max_segments) { req->nr_segments++; - q->elevator.nr_segments++; return 1; } return 0; @@ -327,7 +339,6 @@ if (total_segments > max_segments) return 0; - q->elevator.nr_segments -= same_segment; req->nr_segments = total_segments; return 1; } @@ -364,7 +375,7 @@ } } -static void generic_unplug_device(void *data) +void generic_unplug_device(void *data) { request_queue_t *q = (request_queue_t *) data; unsigned long flags; @@ -379,19 +390,24 @@ struct request *rq; int i; + INIT_LIST_HEAD(&q->request_freelist[READ]); + INIT_LIST_HEAD(&q->request_freelist[WRITE]); + INIT_LIST_HEAD(&q->pending_freelist[READ]); + INIT_LIST_HEAD(&q->pending_freelist[WRITE]); + q->pending_free[READ] = q->pending_free[WRITE] = 0; + /* - * Divide requests in half between read and write. This used to - * be a 2/3 advantage for reads, but now reads can steal from - * the write free list. + * Divide requests in half between read and write */ - for (i = 0; i < QUEUE_NR_REQUESTS; i++) { + for (i = 0; i < queue_nr_requests; i++) { rq = kmem_cache_alloc(request_cachep, SLAB_KERNEL); + memset(rq, 0, sizeof(struct request)); rq->rq_status = RQ_INACTIVE; list_add(&rq->table, &q->request_freelist[i & 1]); } init_waitqueue_head(&q->wait_for_request); - spin_lock_init(&q->request_lock); + spin_lock_init(&q->queue_lock); } static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh); @@ -426,14 +442,12 @@ * blk_queue_headactive(). * * Note: - * blk_init_queue() must be paired with a blk_cleanup-queue() call + * blk_init_queue() must be paired with a blk_cleanup_queue() call * when the block device is deactivated (such as at module unload). 
**/ void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) { INIT_LIST_HEAD(&q->queue_head); - INIT_LIST_HEAD(&q->request_freelist[READ]); - INIT_LIST_HEAD(&q->request_freelist[WRITE]); elevator_init(&q->elevator, ELEVATOR_LINUS); blk_init_free_list(q); q->request_fn = rfn; @@ -455,7 +469,6 @@ q->head_active = 1; } - #define blkdev_free_rq(list) list_entry((list)->next, struct request, table); /* * Get a free request. io_request_lock must be held and interrupts @@ -463,37 +476,16 @@ */ static inline struct request *get_request(request_queue_t *q, int rw) { - struct list_head *list = &q->request_freelist[rw]; - struct request *rq; - - /* - * Reads get preferential treatment and are allowed to steal - * from the write free list if necessary. - */ - if (!list_empty(list)) { - rq = blkdev_free_rq(list); - goto got_rq; - } + struct request *rq = NULL; - /* - * if the WRITE list is non-empty, we know that rw is READ - * and that the READ list is empty. allow reads to 'steal' - * from the WRITE list. - */ - if (!list_empty(&q->request_freelist[WRITE])) { - list = &q->request_freelist[WRITE]; - rq = blkdev_free_rq(list); - goto got_rq; + if (!list_empty(&q->request_freelist[rw])) { + rq = blkdev_free_rq(&q->request_freelist[rw]); + list_del(&rq->table); + rq->rq_status = RQ_ACTIVE; + rq->special = NULL; + rq->q = q; } - return NULL; - -got_rq: - list_del(&rq->table); - rq->free_list = list; - rq->rq_status = RQ_ACTIVE; - rq->special = NULL; - rq->q = q; return rq; } @@ -581,25 +573,29 @@ /* * add-request adds a request to the linked list. - * It disables interrupts (acquires the request spinlock) so that it can muck - * with the request-lists in peace. Thus it should be called with no spinlocks - * held. + * io_request_lock is held and interrupts disabled, as we muck with the + * request queue list. * * By this point, req->cmd is always either READ/WRITE, never READA, * which is important for drive_stat_acct() above. */ - static inline void add_request(request_queue_t * q, struct request * req, - struct list_head *head, int lat) + struct list_head *insert_here) { int major; drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1); + if (!q->plugged && q->head_active && insert_here == &q->queue_head) { + spin_unlock_irq(&io_request_lock); + BUG(); + } + /* - * let selected elevator insert the request + * elevator indicated where it wants this request to be + * inserted at elevator_merge time */ - q->elevator.elevator_fn(req, &q->elevator, &q->queue_head, head, lat); + list_add(&req->queue, insert_here); /* * FIXME(eric) I don't understand why there is a need for this @@ -617,20 +613,55 @@ (q->request_fn)(q); } +void inline blk_refill_freelist(request_queue_t *q, int rw) +{ + if (q->pending_free[rw]) { + list_splice(&q->pending_freelist[rw], &q->request_freelist[rw]); + INIT_LIST_HEAD(&q->pending_freelist[rw]); + q->pending_free[rw] = 0; + } +} + /* * Must be called with io_request_lock held and interrupts disabled */ void inline blkdev_release_request(struct request *req) { + request_queue_t *q = req->q; + int rw = req->cmd; + req->rq_status = RQ_INACTIVE; + req->q = NULL; /* - * Request may not have originated from ll_rw_blk + * Request may not have originated from ll_rw_blk. 
if not, + * asumme it has free buffers and check waiters */ - if (req->free_list) { - list_add(&req->table, req->free_list); - req->free_list = NULL; - wake_up(&req->q->wait_for_request); + if (q) { + /* + * we've released enough buffers to start I/O again + */ + if (waitqueue_active(&blk_buffers_wait) + && atomic_read(&queued_sectors) < low_queued_sectors) + wake_up(&blk_buffers_wait); + + if (!list_empty(&q->request_freelist[rw])) { + blk_refill_freelist(q, rw); + list_add(&req->table, &q->request_freelist[rw]); + return; + } + + /* + * free list is empty, add to pending free list and + * batch wakeups + */ + list_add(&req->table, &q->pending_freelist[rw]); + + if (++q->pending_free[rw] >= batch_requests) { + int wake_up = q->pending_free[rw]; + blk_refill_freelist(q, rw); + wake_up_nr(&q->wait_for_request, wake_up); + } } } @@ -658,9 +689,10 @@ * will have been updated to the appropriate number, * and we shouldn't do it here too. */ - if(!(q->merge_requests_fn)(q, req, next, max_segments)) + if (!q->merge_requests_fn(q, req, next, max_segments)) return; + q->elevator.elevator_merge_req_fn(req, next); req->bhtail->b_reqnext = next->bh; req->bhtail = next->bhtail; req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; @@ -699,7 +731,7 @@ int max_segments = MAX_SEGMENTS; struct request * req = NULL, *freereq = NULL; int rw_ahead, max_sectors, el_ret; - struct list_head *head; + struct list_head *head, *insert_here; int latency; elevator_t *elevator = &q->elevator; @@ -713,6 +745,7 @@ rw = READ; /* drop into READ */ case READ: case WRITE: + latency = elevator_request_latency(elevator, rw); break; default: BUG(); @@ -741,38 +774,33 @@ */ max_sectors = get_max_sectors(bh->b_rdev); - latency = elevator_request_latency(elevator, rw); - +again: + head = &q->queue_head; /* * Now we acquire the request spinlock, we have to be mega careful * not to schedule or do something nonatomic */ -again: spin_lock_irq(&io_request_lock); - /* - * skip first entry, for devices with active queue head - */ - head = &q->queue_head; - if (q->head_active && !q->plugged) - head = head->next; - + insert_here = head->prev; if (list_empty(head)) { q->plug_device_fn(q, bh->b_rdev); /* is atomic */ goto get_rq; - } + } else if (q->head_active && !q->plugged) + head = head->next; - el_ret = elevator->elevator_merge_fn(q, &req, bh, rw, - &max_sectors, &max_segments); + el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw, + max_sectors, max_segments); switch (el_ret) { case ELEVATOR_BACK_MERGE: if (!q->back_merge_fn(q, req, bh, max_segments)) break; + elevator->elevator_merge_cleanup_fn(q, req, count); req->bhtail->b_reqnext = bh; req->bhtail = bh; req->nr_sectors = req->hard_nr_sectors += count; - req->e = elevator; + blk_started_io(count); drive_stat_acct(req->rq_dev, req->cmd, count, 0); attempt_back_merge(q, req, max_sectors, max_segments); goto out; @@ -780,20 +808,29 @@ case ELEVATOR_FRONT_MERGE: if (!q->front_merge_fn(q, req, bh, max_segments)) break; + elevator->elevator_merge_cleanup_fn(q, req, count); bh->b_reqnext = req->bh; req->bh = bh; req->buffer = bh->b_data; req->current_nr_sectors = count; req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += count; - req->e = elevator; + blk_started_io(count); drive_stat_acct(req->rq_dev, req->cmd, count, 0); attempt_front_merge(q, head, req, max_sectors, max_segments); goto out; + /* * elevator says don't/can't merge. get new request */ case ELEVATOR_NO_MERGE: + /* + * use elevator hints as to where to insert the + * request. 
if no hints, just add it to the back + * of the queue + */ + if (req) + insert_here = &req->queue; break; default: @@ -802,10 +839,9 @@ } /* - * Grab a free request from the freelist. Read first try their - * own queue - if that is empty, we steal from the write list. - * Writes must block if the write list is empty, and read aheads - * are not crucial. + * Grab a free request from the freelist - if that is empty, check + * if we are doing read ahead and abort instead of blocking for + * a free slot. */ get_rq: if (freereq) { @@ -821,6 +857,7 @@ } /* fill up the request-info, and add it to the queue */ + req->elevator_sequence = latency; req->cmd = rw; req->errors = 0; req->hard_sector = req->sector = sector; @@ -833,13 +870,13 @@ req->bh = bh; req->bhtail = bh; req->rq_dev = bh->b_rdev; - req->e = elevator; - add_request(q, req, head, latency); + blk_started_io(count); + add_request(q, req, insert_here); out: - if (!q->plugged) - (q->request_fn)(q); if (freereq) blkdev_release_request(freereq); + if (!q->plugged) + q->request_fn(q); spin_unlock_irq(&io_request_lock); return 0; end_io: @@ -886,13 +923,13 @@ int major = MAJOR(bh->b_rdev); request_queue_t *q; - if (!bh->b_end_io) BUG(); + if (!bh->b_end_io) + BUG(); + if (blk_size[major]) { unsigned long maxsector = (blk_size[major][MINOR(bh->b_rdev)] << 1) + 1; - unsigned int sector, count; - - count = bh->b_size >> 9; - sector = bh->b_rsector; + unsigned long sector = bh->b_rsector; + unsigned int count = bh->b_size >> 9; if (maxsector < count || maxsector - count < sector) { bh->b_state &= (1 << BH_Lock) | (1 << BH_Mapped); @@ -903,7 +940,7 @@ when mounting a device. */ printk(KERN_INFO "attempt to access beyond end of device\n"); - printk(KERN_INFO "%s: rw=%d, want=%d, limit=%d\n", + printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n", kdevname(bh->b_rdev), rw, (sector + count)>>1, blk_size[major][MINOR(bh->b_rdev)]); @@ -930,15 +967,13 @@ buffer_IO_error(bh); break; } - - } - while (q->make_request_fn(q, rw, bh)); + } while (q->make_request_fn(q, rw, bh)); } /** * submit_bh: submit a buffer_head to the block device later for I/O - * @rw: whether to %READ or %WRITE, or mayve to %READA (read ahead) + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) * @bh: The &struct buffer_head which describes the I/O * * submit_bh() is very similar in purpose to generic_make_request(), and @@ -961,7 +996,7 @@ * further remap this. */ bh->b_rdev = bh->b_dev; - bh->b_rsector = bh->b_blocknr * (bh->b_size>>9); + bh->b_rsector = bh->b_blocknr * (bh->b_size >> 9); generic_make_request(rw, bh); @@ -1021,6 +1056,9 @@ int correct_size; int i; + if (!nr) + return; + major = MAJOR(bhs[0]->b_dev); /* Determine correct block size for this device. */ @@ -1033,9 +1071,8 @@ /* Verify requested block sizes. */ for (i = 0; i < nr; i++) { - struct buffer_head *bh; - bh = bhs[i]; - if (bh->b_size != correct_size) { + struct buffer_head *bh = bhs[i]; + if (bh->b_size % correct_size) { printk(KERN_NOTICE "ll_rw_block: device %s: " "only %d-char blocks implemented (%u)\n", kdevname(bhs[0]->b_dev), @@ -1051,8 +1088,17 @@ } for (i = 0; i < nr; i++) { - struct buffer_head *bh; - bh = bhs[i]; + struct buffer_head *bh = bhs[i]; + + /* + * don't lock any more buffers if we are above the high + * water mark. instead start I/O on the queued stuff. 
+ */ + if (atomic_read(&queued_sectors) >= high_queued_sectors) { + run_task_queue(&tq_disk); + wait_event(blk_buffers_wait, + atomic_read(&queued_sectors) < low_queued_sectors); + } /* Only one thread can actually submit the I/O. */ if (test_and_set_bit(BH_Lock, &bh->b_state)) @@ -1115,6 +1161,7 @@ if ((bh = req->bh) != NULL) { nsect = bh->b_size >> 9; + blk_finished_io(nsect); req->bh = bh->b_reqnext; bh->b_reqnext = NULL; bh->b_end_io(bh, uptodate); @@ -1138,19 +1185,18 @@ void end_that_request_last(struct request *req) { - if (req->e) { - printk("end_that_request_last called with non-dequeued req\n"); - BUG(); - } if (req->sem != NULL) up(req->sem); blkdev_release_request(req); } +#define MB(kb) ((kb) << 10) + int __init blk_dev_init(void) { struct blk_dev_struct *dev; + int total_ram; request_cachep = kmem_cache_create("blkdev_requests", sizeof(struct request), @@ -1165,6 +1211,51 @@ memset(ro_bits,0,sizeof(ro_bits)); memset(max_readahead, 0, sizeof(max_readahead)); memset(max_sectors, 0, sizeof(max_sectors)); + + atomic_set(&queued_sectors, 0); + total_ram = nr_free_pages() << (PAGE_SHIFT - 10); + + /* + * Try to keep 128MB max hysteris. If not possible, + * use half of RAM + */ + high_queued_sectors = (total_ram * 2) / 3; + low_queued_sectors = high_queued_sectors - MB(128); + if (low_queued_sectors < 0) + low_queued_sectors = total_ram / 2; + + /* + * for big RAM machines (>= 384MB), use more for I/O + */ + if (total_ram >= MB(384)) { + high_queued_sectors = (total_ram * 4) / 5; + low_queued_sectors = high_queued_sectors - MB(128); + } + + /* + * make it sectors (512b) + */ + high_queued_sectors <<= 1; + low_queued_sectors <<= 1; + + /* + * Scale free request slots per queue too + */ + total_ram = (total_ram + MB(32) - 1) & ~(MB(32) - 1); + if ((queue_nr_requests = total_ram >> 9) > QUEUE_NR_REQUESTS) + queue_nr_requests = QUEUE_NR_REQUESTS; + + /* + * adjust batch frees according to queue length, with upper limit + */ + if ((batch_requests = queue_nr_requests >> 3) > 32) + batch_requests = 32; + + printk("block: queued sectors max/low %dkB/%dkB, %d slots per queue\n", + high_queued_sectors / 2, + low_queued_sectors / 2, + queue_nr_requests); + #ifdef CONFIG_AMIGA_Z2RAM z2_init(); #endif @@ -1279,9 +1370,12 @@ EXPORT_SYMBOL(end_that_request_last); EXPORT_SYMBOL(blk_init_queue); EXPORT_SYMBOL(blk_get_queue); +EXPORT_SYMBOL(__blk_get_queue); EXPORT_SYMBOL(blk_cleanup_queue); EXPORT_SYMBOL(blk_queue_headactive); EXPORT_SYMBOL(blk_queue_pluggable); EXPORT_SYMBOL(blk_queue_make_request); EXPORT_SYMBOL(generic_make_request); EXPORT_SYMBOL(blkdev_release_request); +EXPORT_SYMBOL(generic_unplug_device); +EXPORT_SYMBOL(queued_sectors); diff -u --recursive --new-file v2.4.0/linux/drivers/block/paride/pd.c linux/drivers/block/paride/pd.c --- v2.4.0/linux/drivers/block/paride/pd.c Thu Oct 26 23:35:47 2000 +++ linux/drivers/block/paride/pd.c Mon Jan 15 13:08:15 2001 @@ -392,7 +392,6 @@ if (req->nr_segments < max_segments) { req->nr_segments++; - q->elevator.nr_segments++; return 1; } return 0; @@ -432,7 +431,6 @@ if (total_segments > max_segments) return 0; - q->elevator.nr_segments -= same_segment; req->nr_segments = total_segments; return 1; } diff -u --recursive --new-file v2.4.0/linux/drivers/block/paride/pf.c linux/drivers/block/paride/pf.c --- v2.4.0/linux/drivers/block/paride/pf.c Thu Oct 26 23:35:47 2000 +++ linux/drivers/block/paride/pf.c Mon Jan 15 13:08:15 2001 @@ -346,7 +346,6 @@ if (req->nr_segments < max_segments) { req->nr_segments++; - q->elevator.nr_segments++; return 1; } 
return 0; @@ -386,7 +385,6 @@ if (total_segments > max_segments) return 0; - q->elevator.nr_segments -= same_segment; req->nr_segments = total_segments; return 1; } diff -u --recursive --new-file v2.4.0/linux/drivers/char/drm/Config.in linux/drivers/char/drm/Config.in --- v2.4.0/linux/drivers/char/drm/Config.in Tue Aug 8 09:27:33 2000 +++ linux/drivers/char/drm/Config.in Mon Jan 15 11:08:13 2001 @@ -9,7 +9,7 @@ if [ "$CONFIG_DRM" != "n" ]; then tristate ' 3dfx Banshee/Voodoo3+' CONFIG_DRM_TDFX tristate ' 3dlabs GMX 2000' CONFIG_DRM_GAMMA - tristate ' ATI Rage 128' CONFIG_DRM_R128 + dep_tristate ' ATI Rage 128' CONFIG_DRM_R128 $CONFIG_AGP dep_tristate ' Intel I810' CONFIG_DRM_I810 $CONFIG_AGP dep_tristate ' Matrox g200/g400' CONFIG_DRM_MGA $CONFIG_AGP fi diff -u --recursive --new-file v2.4.0/linux/drivers/i2o/i2o_block.c linux/drivers/i2o/i2o_block.c --- v2.4.0/linux/drivers/i2o/i2o_block.c Wed Nov 8 17:09:50 2000 +++ linux/drivers/i2o/i2o_block.c Mon Jan 15 13:08:15 2001 @@ -392,7 +392,6 @@ if (req->nr_segments < max_segments) { req->nr_segments++; - q->elevator.nr_segments++; return 1; } return 0; @@ -436,7 +435,6 @@ if (total_segments > max_segments) return 0; - q->elevator.nr_segments -= same_segment; req->nr_segments = total_segments; return 1; } diff -u --recursive --new-file v2.4.0/linux/drivers/ide/ide-dma.c linux/drivers/ide/ide-dma.c --- v2.4.0/linux/drivers/ide/ide-dma.c Tue Jan 2 16:58:45 2001 +++ linux/drivers/ide/ide-dma.c Mon Jan 15 13:08:15 2001 @@ -226,6 +226,9 @@ unsigned char *virt_addr = bh->b_data; unsigned int size = bh->b_size; + if (nents >= PRD_ENTRIES) + return 0; + while ((bh = bh->b_reqnext) != NULL) { if ((virt_addr + size) != (unsigned char *) bh->b_data) break; @@ -259,6 +262,9 @@ HWIF(drive)->sg_nents = i = ide_build_sglist(HWIF(drive), HWGROUP(drive)->rq); + if (!i) + return 0; + sg = HWIF(drive)->sg_table; while (i && sg_dma_len(sg)) { u32 cur_addr; @@ -274,7 +280,7 @@ */ while (cur_len) { - if (++count >= PRD_ENTRIES) { + if (count++ >= PRD_ENTRIES) { printk("%s: DMA table too small\n", drive->name); pci_unmap_sg(HWIF(drive)->pci_dev, HWIF(drive)->sg_table, diff -u --recursive --new-file v2.4.0/linux/drivers/ide/ide-probe.c linux/drivers/ide/ide-probe.c --- v2.4.0/linux/drivers/ide/ide-probe.c Thu Oct 26 23:35:48 2000 +++ linux/drivers/ide/ide-probe.c Mon Jan 15 13:08:15 2001 @@ -134,7 +134,7 @@ break; } #endif - printk ("CDROM"); + printk ("CD/DVD-ROM"); break; case ide_tape: printk ("TAPE"); @@ -761,9 +761,10 @@ for (unit = 0; unit < minors; ++unit) { *bs++ = BLOCK_SIZE; #ifdef CONFIG_BLK_DEV_PDC4030 - *max_sect++ = ((hwif->chipset == ide_pdc4030) ? 127 : MAX_SECTORS); + *max_sect++ = ((hwif->chipset == ide_pdc4030) ? 127 : 256); #else - *max_sect++ = MAX_SECTORS; + /* IDE can do up to 128K per request. 
*/ + *max_sect++ = 256; #endif *max_ra++ = MAX_READAHEAD; } diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/hisax/Makefile linux/drivers/isdn/hisax/Makefile --- v2.4.0/linux/drivers/isdn/hisax/Makefile Fri Dec 29 14:40:54 2000 +++ linux/drivers/isdn/hisax/Makefile Mon Jan 8 15:06:01 2001 @@ -34,8 +34,8 @@ hisax-objs-$(CONFIG_HISAX_ASUSCOM) += asuscom.o isac.o arcofi.o hscx.o hisax-objs-$(CONFIG_HISAX_TELEINT) += teleint.o isac.o arcofi.o hfc_2bs0.o hisax-objs-$(CONFIG_HISAX_SEDLBAUER) += sedlbauer.o isac.o arcofi.o hscx.o isar.o -hisax-objs-$(CONFIG_HISAX_SPORTSTER) += sportster.o isac.o arcofi.o hfc_2bs0.o -hisax-objs-$(CONFIG_HISAX_MIC) += mic.o isac.o arcofi.o hfc_2bs0.o +hisax-objs-$(CONFIG_HISAX_SPORTSTER) += sportster.o isac.o arcofi.o hscx.o +hisax-objs-$(CONFIG_HISAX_MIC) += mic.o isac.o arcofi.o hscx.o hisax-objs-$(CONFIG_HISAX_NETJET) += nj_s.o netjet.o isac.o arcofi.o hisax-objs-$(CONFIG_HISAX_NETJET_U) += nj_u.o netjet.o icc.o hisax-objs-$(CONFIG_HISAX_HFCS) += hfcscard.o hfc_2bds0.o diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/hisax/isdnl3.c linux/drivers/isdn/hisax/isdnl3.c --- v2.4.0/linux/drivers/isdn/hisax/isdnl3.c Tue Nov 28 21:43:13 2000 +++ linux/drivers/isdn/hisax/isdnl3.c Mon Jan 8 15:19:34 2001 @@ -566,7 +566,7 @@ } else { struct sk_buff *skb = arg; - skb_queue_head(&st->l3.squeue, skb); + skb_queue_tail(&st->l3.squeue, skb); FsmEvent(&st->l3.l3m, EV_ESTABLISH_REQ, NULL); } break; diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/hisax/md5sums.asc linux/drivers/isdn/hisax/md5sums.asc --- v2.4.0/linux/drivers/isdn/hisax/md5sums.asc Thu Jan 4 13:20:17 2001 +++ linux/drivers/isdn/hisax/md5sums.asc Wed Jan 10 14:12:53 2001 @@ -10,7 +10,7 @@ ca7bd9bac39203f3074f3f093948cc3c isac.c a2ad619fd404b3149099a2984de9d23c isdnl1.c d2a78e407f3d94876deac160c6f9aae6 isdnl2.c -a109841c2e75b11fc8ef2c8718e24c3e isdnl3.c +e7932ca7ae39c497c17f13a2e1434fcd isdnl3.c afb5f2f4ac296d6de45c856993b161e1 tei.c 00023e2a482cb86a26ea870577ade5d6 callc.c a1834e9b2ec068440cff2e899eff4710 cert.c @@ -25,9 +25,9 @@ Version: 2.6.3i Charset: noconv -iQCVAwUBOlMTgDpxHvX/mS9tAQFSbgP/W9y6tnnWHTRLGqyr3EY1OHZiQXERkAAu -hp+Y8PIoX1GgAh4yZ7xhYwUsk6y0z5USdGuhC9ZHh+oZd57lPsJMnhkEZR5BVsYT -r7jHwelP527+QCLkVUCHIVIWUW0ANzeZBhDV2vefkFb+gWLiZsBhaHssbcKGsMNG -Ak4xS1ByqsM= -=lsIJ +iQCVAwUBOlxeLTpxHvX/mS9tAQH6RwP8DhyvqAnXFV6WIGi16iQ3vKikkPoqnDQs +GEn5uCW0dPYKlwthD2Grj/JbMYZhOmCFuDxF7ufJnjTSDe/D8XNe2wngxzAiwcIe +WjCrT8X95cuP3HZHscbFTEinVV0GAnoI0ZEgs5eBDhVHDqILLYMaTFBQaRH3jgXc +i5VH88jPfUM= +=qc+J -----END PGP SIGNATURE----- diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/isdn_common.c linux/drivers/isdn/isdn_common.c --- v2.4.0/linux/drivers/isdn/isdn_common.c Tue Jan 2 16:45:38 2001 +++ linux/drivers/isdn/isdn_common.c Mon Jan 8 15:06:01 2001 @@ -1512,7 +1512,7 @@ int i; if ((ret = verify_area(VERIFY_READ, (void *) arg, - (ISDN_MODEM_NUMREG + ISDN_MSNLEN) + (ISDN_MODEM_NUMREG + ISDN_MSNLEN + ISDN_LMSNLEN) * ISDN_MAX_CHANNELS))) return ret; @@ -1521,6 +1521,9 @@ ISDN_MODEM_NUMREG)) return -EFAULT; p += ISDN_MODEM_NUMREG; + if (copy_from_user(dev->mdm.info[i].emu.plmsn, p, ISDN_LMSNLEN)) + return -EFAULT; + p += ISDN_LMSNLEN; if (copy_from_user(dev->mdm.info[i].emu.pmsn, p, ISDN_MSNLEN)) return -EFAULT; p += ISDN_MSNLEN; diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/isdn_net.c linux/drivers/isdn/isdn_net.c --- v2.4.0/linux/drivers/isdn/isdn_net.c Fri Dec 29 14:07:22 2000 +++ linux/drivers/isdn/isdn_net.c Mon Jan 8 15:06:01 2001 @@ -2325,6 +2325,7 @@ memset(netdev, 0, 
sizeof(isdn_net_dev)); if (!(netdev->local = (isdn_net_local *) kmalloc(sizeof(isdn_net_local), GFP_KERNEL))) { printk(KERN_WARNING "isdn_net: Could not allocate device locals\n"); + kfree(netdev); return NULL; } memset(netdev->local, 0, sizeof(isdn_net_local)); diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/isdn_ppp.c linux/drivers/isdn/isdn_ppp.c --- v2.4.0/linux/drivers/isdn/isdn_ppp.c Tue Nov 28 21:43:13 2000 +++ linux/drivers/isdn/isdn_ppp.c Mon Jan 8 15:20:19 2001 @@ -1131,9 +1131,9 @@ proto = PPP_IPX; /* untested */ break; default: - dev_kfree_skb(skb); printk(KERN_ERR "isdn_ppp: skipped unsupported protocol: %#x.\n", skb->protocol); + dev_kfree_skb(skb); return 0; } diff -u --recursive --new-file v2.4.0/linux/drivers/isdn/isdn_v110.c linux/drivers/isdn/isdn_v110.c --- v2.4.0/linux/drivers/isdn/isdn_v110.c Sun Aug 6 12:43:42 2000 +++ linux/drivers/isdn/isdn_v110.c Mon Jan 15 15:31:18 2001 @@ -102,7 +102,7 @@ int i; isdn_v110_stream *v; - if ((v = kmalloc(sizeof(isdn_v110_stream), GFP_KERNEL)) == NULL) + if ((v = kmalloc(sizeof(isdn_v110_stream), GFP_ATOMIC)) == NULL) return NULL; memset(v, 0, sizeof(isdn_v110_stream)); v->key = key; @@ -134,7 +134,7 @@ v->b = 0; v->skbres = hdrlen; v->maxsize = maxsize - hdrlen; - if ((v->encodebuf = kmalloc(maxsize, GFP_KERNEL)) == NULL) { + if ((v->encodebuf = kmalloc(maxsize, GFP_ATOMIC)) == NULL) { kfree(v); return NULL; } diff -u --recursive --new-file v2.4.0/linux/drivers/net/3c59x.c linux/drivers/net/3c59x.c --- v2.4.0/linux/drivers/net/3c59x.c Tue Nov 14 11:34:25 2000 +++ linux/drivers/net/3c59x.c Sat Jan 6 09:27:42 2001 @@ -118,6 +118,14 @@ LK1.1.11 13 Nov 2000 andrewm - Dump MOD_INC/DEC_USE_COUNT, use SET_MODULE_OWNER + LK1.1.12 1 Jan 2001 andrewm + - Call pci_enable_device before we request our IRQ (Tobias Ringstrom) + - Add 3c590 PCI latency timer hack to vortex_probe1 (from 0.99Ra) + - Added extended wait_for_completion for the 3c905CX. + - Look for an MII on PHY index 24 first (3c905CX oddity). + - Add HAS_NWAY to 3cSOHO100-TX (Brett Frankenberger) + - Don't free skbs we don't own on oom path in vortex_open(). + - See http://www.uow.edu.au/~andrewm/linux/#3c59x-2.3 for more details. - Also see Documentation/networking/vortex.txt */ @@ -203,7 +211,7 @@ #include static char version[] __devinitdata = -"3c59x.c:LK1.1.11 13 Nov 2000 Donald Becker and others. http://www.scyld.com/network/vortex.html " "$Revision: 1.102.2.46 $\n"; +"3c59x.c:LK1.1.12 06 Jan 2000 Donald Becker and others. 
http://www.scyld.com/network/vortex.html " "$Revision: 1.102.2.46 $\n"; MODULE_AUTHOR("Donald Becker "); MODULE_DESCRIPTION("3Com 3c59x/3c90x/3c575 series Vortex/Boomerang/Cyclone driver"); @@ -424,7 +432,7 @@ PCI_USES_IO|PCI_USES_MASTER, IS_CYCLONE, 128, }, {"3cSOHO100-TX Hurricane", - PCI_USES_IO|PCI_USES_MASTER, IS_CYCLONE, 128, }, + PCI_USES_IO|PCI_USES_MASTER, IS_CYCLONE|HAS_NWAY, 128, }, {"3c555 Laptop Hurricane", PCI_USES_IO|PCI_USES_MASTER, IS_CYCLONE|EEPROM_8BIT, 128, }, {"3c556 Laptop Tornado", @@ -843,10 +851,15 @@ { int rc; - rc = vortex_probe1 (pdev, pci_resource_start (pdev, 0), pdev->irq, - ent->driver_data, vortex_cards_found); - if (rc == 0) - vortex_cards_found++; + /* wake up and enable device */ + if (pci_enable_device (pdev)) { + rc = -EIO; + } else { + rc = vortex_probe1 (pdev, pci_resource_start (pdev, 0), pdev->irq, + ent->driver_data, vortex_cards_found); + if (rc == 0) + vortex_cards_found++; + } return rc; } @@ -863,7 +876,7 @@ struct vortex_private *vp; int option; unsigned int eeprom[0x40], checksum = 0; /* EEPROM contents */ - int i; + int i, step; struct net_device *dev; static int printed_version; int retval; @@ -889,7 +902,6 @@ vci->name, ioaddr); - /* private struct aligned and zeroed by init_etherdev */ vp = dev->priv; dev->base_addr = ioaddr; dev->irq = irq; @@ -908,19 +920,29 @@ if (pdev) { /* EISA resources already marked, so only PCI needs to do this here */ /* Ignore return value, because Cardbus drivers already allocate for us */ - if (request_region(ioaddr, vci->io_size, dev->name) != NULL) { + if (request_region(ioaddr, vci->io_size, dev->name) != NULL) vp->must_free_region = 1; - } - - /* wake up and enable device */ - if (pci_enable_device (pdev)) { - retval = -EIO; - goto free_region; - } /* enable bus-mastering if necessary */ if (vci->flags & PCI_USES_MASTER) pci_set_master (pdev); + + if (vci->drv_flags & IS_VORTEX) { + u8 pci_latency; + u8 new_latency = 248; + + /* Check the PCI latency value. On the 3c590 series the latency timer + must be set to the maximum value to avoid data corruption that occurs + when the timer expires during a transfer. This bug exists the Vortex + chip only. 
*/ + pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &pci_latency); + if (pci_latency < new_latency) { + printk(KERN_INFO "%s: Overriding PCI latency" + " timer (CFLT) setting of %d, new value is %d.\n", + dev->name, pci_latency, new_latency); + pci_write_config_byte(pdev, PCI_LATENCY_TIMER, new_latency); + } + } } spin_lock_init(&vp->lock); @@ -1025,6 +1047,13 @@ dev->irq); #endif + EL3WINDOW(4); + step = (inb(ioaddr + Wn4_NetDiag) & 0x1e) >> 1; + printk(KERN_INFO " product code '%c%c' rev %02x.%d date %02d-" + "%02d-%02d\n", eeprom[6]&0xff, eeprom[6]>>8, eeprom[0x14], + step, (eeprom[4]>>5) & 15, eeprom[4] & 31, eeprom[4]>>9); + + if (pdev && vci->drv_flags & HAS_CB_FNS) { unsigned long fn_st_addr; /* Cardbus function status space */ unsigned short n; @@ -1089,8 +1118,19 @@ mii_preamble_required++; mii_preamble_required++; mdio_read(dev, 24, 1); - for (phy = 1; phy <= 32 && phy_idx < sizeof(vp->phys); phy++) { - int mii_status, phyx = phy & 0x1f; + for (phy = 0; phy < 32 && phy_idx < 1; phy++) { + int mii_status, phyx; + + /* + * For the 3c905CX we look at index 24 first, because it bogusly + * reports an external PHY at all indices + */ + if (phy == 0) + phyx = 24; + else if (phy <= 24) + phyx = phy - 1; + else + phyx = phy; mii_status = mdio_read(dev, phyx, 1); if (mii_status && mii_status != 0xffff) { vp->phys[phy_idx++] = phyx; @@ -1135,12 +1175,13 @@ dev->set_multicast_list = set_rx_mode; dev->tx_timeout = vortex_tx_timeout; dev->watchdog_timeo = (watchdog * HZ) / 1000; - +// publish_netdev(dev); return 0; free_region: if (vp->must_free_region) release_region(ioaddr, vci->io_size); +// withdraw_netdev(dev); unregister_netdev(dev); kfree (dev); printk(KERN_ERR PFX "vortex_probe1 fails. Returns %d\n", retval); @@ -1150,13 +1191,23 @@ static void wait_for_completion(struct net_device *dev, int cmd) { - int i = 4000; + int i; outw(cmd, dev->base_addr + EL3_CMD); - while (--i > 0) { + for (i = 0; i < 2000; i++) { if (!(inw(dev->base_addr + EL3_STATUS) & CmdInProgress)) return; } + + /* OK, that didn't work. Do it the slow way. One second */ + for (i = 0; i < 100000; i++) { + if (!(inw(dev->base_addr + EL3_STATUS) & CmdInProgress)) { + printk(KERN_INFO "%s: command 0x%04x took %d usecs! Please tell andrewm@uow.edu.au\n", + dev->name, cmd, i * 10); + return; + } + udelay(10); + } printk(KERN_ERR "%s: command 0x%04x did not complete! Status=0x%x\n", dev->name, cmd, inw(dev->base_addr + EL3_STATUS)); } @@ -1331,6 +1382,7 @@ set_rx_mode(dev); outw(StatsEnable, ioaddr + EL3_CMD); /* Turn on statistics. */ +// wait_for_completion(dev, SetTxStart|0x07ff); outw(RxEnable, ioaddr + EL3_CMD); /* Enable the receiver. */ outw(TxEnable, ioaddr + EL3_CMD); /* Enable transmitter. */ /* Allow status bits to be seen. */ @@ -1384,7 +1436,8 @@ } if (i != RX_RING_SIZE) { int j; - for (j = 0; j < RX_RING_SIZE; j++) { + printk(KERN_EMERG "%s: no memory for rx ring\n", dev->name); + for (j = 0; j < i; j++) { if (vp->rx_skbuff[j]) { dev_kfree_skb(vp->rx_skbuff[j]); vp->rx_skbuff[j] = 0; @@ -1532,7 +1585,10 @@ printk(KERN_ERR "%s: transmit timed out, tx_status %2.2x status %4.4x.\n", dev->name, inb(ioaddr + TxStatus), inw(ioaddr + EL3_STATUS)); - + EL3WINDOW(4); + printk(KERN_ERR " diagnostics: net %04x media %04x dma %8.8x.\n", + inw(ioaddr + Wn4_NetDiag), inw(ioaddr + Wn4_Media), + inl(ioaddr + PktStatus)); /* Slight code bloat to be user friendly. 
*/ if ((inb(ioaddr + TxStatus) & 0x88) == 0x88) printk(KERN_ERR "%s: Transmitter encountered 16 collisions --" @@ -1663,6 +1719,12 @@ dev->name, fifo_diag); /* Adapter failure requires Tx/Rx reset and reinit. */ if (vp->full_bus_master_tx) { + int bus_status = inl(ioaddr + PktStatus); + /* 0x80000000 PCI master abort. */ + /* 0x40000000 PCI target abort. */ + if (vortex_debug) + printk(KERN_ERR "%s: PCI bus error, bus status %8.8x\n", dev->name, bus_status); + /* In this case, blow the card away */ vortex_down(dev); wait_for_completion(dev, TotalReset | 0xff); diff -u --recursive --new-file v2.4.0/linux/drivers/net/Makefile linux/drivers/net/Makefile --- v2.4.0/linux/drivers/net/Makefile Thu Jan 4 13:00:55 2001 +++ linux/drivers/net/Makefile Sat Jan 6 19:45:14 2001 @@ -26,7 +26,7 @@ obj-$(CONFIG_ISDN) += slhc.o endif -subdir-$(CONFIG_PCMCIA) += pcmcia +subdir-$(CONFIG_NET_PCMCIA) += pcmcia subdir-$(CONFIG_TULIP) += tulip subdir-$(CONFIG_IRDA) += irda subdir-$(CONFIG_TR) += tokenring diff -u --recursive --new-file v2.4.0/linux/drivers/net/depca.c linux/drivers/net/depca.c --- v2.4.0/linux/drivers/net/depca.c Mon Oct 23 15:51:36 2000 +++ linux/drivers/net/depca.c Mon Jan 8 09:09:36 2001 @@ -1817,7 +1817,9 @@ ManCode[5]='\0'; for (i=0;(*signatures[i] != '\0') && (*name == '\0');i++) { - if (strstr(ManCode, signatures[i]) != NULL) { + const char * volatile lhs = ManCode; + const char * volatile rhs = signatures[i]; /* egcs-1.1.2 bug */ + if (strstr(lhs, rhs) != NULL) { strcpy(name,ManCode); status = 1; } diff -u --recursive --new-file v2.4.0/linux/drivers/net/dmfe.c linux/drivers/net/dmfe.c --- v2.4.0/linux/drivers/net/dmfe.c Tue Dec 5 12:29:38 2000 +++ linux/drivers/net/dmfe.c Mon Jan 8 09:09:36 2001 @@ -1596,10 +1596,10 @@ break; } - rc = pci_register_driver(&dmfe_driver); + rc = pci_module_init(&dmfe_driver); if (rc < 0) return rc; - if (rc > 0) { + if (rc >= 0) { printk (KERN_INFO "Davicom DM91xx net driver loaded, version " DMFE_VERSION "\n"); return 0; diff -u --recursive --new-file v2.4.0/linux/drivers/net/ppp_async.c linux/drivers/net/ppp_async.c --- v2.4.0/linux/drivers/net/ppp_async.c Fri Apr 21 13:31:10 2000 +++ linux/drivers/net/ppp_async.c Mon Jan 15 11:04:57 2001 @@ -33,13 +33,6 @@ #include #include -#ifndef spin_trylock_bh -#define spin_trylock_bh(lock) ({ int __r; local_bh_disable(); \ - __r = spin_trylock(lock); \ - if (!__r) local_bh_enable(); \ - __r; }) -#endif - #define PPP_VERSION "2.4.1" #define OBUFSIZE 256 @@ -76,6 +69,7 @@ /* Bit numbers in xmit_flags */ #define XMIT_WAKEUP 0 #define XMIT_FULL 1 +#define XMIT_BUSY 2 /* State bits */ #define SC_TOSS 0x20000000 @@ -181,18 +175,14 @@ } /* - * Read does nothing. + * Read does nothing - no data is ever available this way. + * Pppd reads and writes packets via /dev/ppp instead. 
*/ static ssize_t ppp_asynctty_read(struct tty_struct *tty, struct file *file, unsigned char *buf, size_t count) { - /* For now, do the same as the old 2.3.x code useta */ - struct asyncppp *ap = tty->disc_data; - - if (ap == 0) - return -ENXIO; - return ppp_channel_read(&ap->chan, file, buf, count); + return -EAGAIN; } /* @@ -203,12 +193,7 @@ ppp_asynctty_write(struct tty_struct *tty, struct file *file, const unsigned char *buf, size_t count) { - /* For now, do the same as the old 2.3.x code useta */ - struct asyncppp *ap = tty->disc_data; - - if (ap == 0) - return -ENXIO; - return ppp_channel_write(&ap->chan, buf, count); + return -EAGAIN; } static int @@ -259,25 +244,6 @@ err = 0; break; -/* - * For now, do the same as the old 2.3 driver useta - */ - case PPPIOCGFLAGS: - case PPPIOCSFLAGS: - case PPPIOCGASYNCMAP: - case PPPIOCSASYNCMAP: - case PPPIOCGRASYNCMAP: - case PPPIOCSRASYNCMAP: - case PPPIOCGXASYNCMAP: - case PPPIOCSXASYNCMAP: - case PPPIOCGMRU: - case PPPIOCSMRU: - err = -EPERM; - if (!capable(CAP_NET_ADMIN)) - break; - err = ppp_async_ioctl(&ap->chan, cmd, arg); - break; - case PPPIOCATTACH: case PPPIOCDETACH: err = ppp_channel_ioctl(&ap->chan, cmd, arg); @@ -294,18 +260,7 @@ static unsigned int ppp_asynctty_poll(struct tty_struct *tty, struct file *file, poll_table *wait) { - unsigned int mask; - struct asyncppp *ap = tty->disc_data; - - mask = POLLOUT | POLLWRNORM; -/* - * For now, do the same as the old 2.3 driver useta - */ - if (ap != 0) - mask |= ppp_channel_poll(&ap->chan, file, wait); - if (test_bit(TTY_OTHER_CLOSED, &tty->flags) || tty_hung_up_p(file)) - mask |= POLLHUP; - return mask; + return 0; } static int @@ -637,8 +592,18 @@ int tty_stuffed = 0; set_bit(XMIT_WAKEUP, &ap->xmit_flags); - if (!spin_trylock_bh(&ap->xmit_lock)) + /* + * We can get called recursively here if the tty write + * function calls our wakeup function. This can happen + * for example on a pty with both the master and slave + * set to PPP line discipline. + * We use the XMIT_BUSY bit to detect this and get out, + * leaving the XMIT_WAKEUP bit set to tell the other + * instance that it may now be able to write more now. + */ + if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) return 0; + spin_lock_bh(&ap->xmit_lock); for (;;) { if (test_and_clear_bit(XMIT_WAKEUP, &ap->xmit_flags)) tty_stuffed = 0; @@ -653,7 +618,7 @@ tty_stuffed = 1; continue; } - if (ap->optr == ap->olim && ap->tpkt != 0) { + if (ap->optr >= ap->olim && ap->tpkt != 0) { if (ppp_async_encode(ap)) { /* finished processing ap->tpkt */ clear_bit(XMIT_FULL, &ap->xmit_flags); @@ -661,17 +626,29 @@ } continue; } - /* haven't made any progress */ - spin_unlock_bh(&ap->xmit_lock); + /* + * We haven't made any progress this time around. + * Clear XMIT_BUSY to let other callers in, but + * after doing so we have to check if anyone set + * XMIT_WAKEUP since we last checked it. If they + * did, we should try again to set XMIT_BUSY and go + * around again in case XMIT_BUSY was still set when + * the other caller tried. + */ + clear_bit(XMIT_BUSY, &ap->xmit_flags); + /* any more work to do? 
if not, exit the loop */ if (!(test_bit(XMIT_WAKEUP, &ap->xmit_flags) || (!tty_stuffed && ap->tpkt != 0))) break; - if (!spin_trylock_bh(&ap->xmit_lock)) + /* more work to do, see if we can do it now */ + if (test_and_set_bit(XMIT_BUSY, &ap->xmit_flags)) break; } + spin_unlock_bh(&ap->xmit_lock); return done; flush: + clear_bit(XMIT_BUSY, &ap->xmit_flags); if (ap->tpkt != 0) { kfree_skb(ap->tpkt); ap->tpkt = 0; diff -u --recursive --new-file v2.4.0/linux/drivers/s390/block/dasd.c linux/drivers/s390/block/dasd.c --- v2.4.0/linux/drivers/s390/block/dasd.c Thu Oct 26 23:35:48 2000 +++ linux/drivers/s390/block/dasd.c Mon Jan 15 13:08:15 2001 @@ -952,7 +952,6 @@ go = 1; while (go && !list_empty(&queue->queue_head)) { req = blkdev_entry_next_request(&queue->queue_head); - req = blkdev_entry_next_request(&queue->queue_head); di = DEVICE_NR (req->rq_dev); dasd_debug ((unsigned long) req); /* req */ dasd_debug (0xc4d90000 + /* DR## */ diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/constants.c linux/drivers/scsi/constants.c --- v2.4.0/linux/drivers/scsi/constants.c Mon Mar 13 22:15:03 2000 +++ linux/drivers/scsi/constants.c Mon Jan 15 13:08:15 2001 @@ -776,7 +776,7 @@ printk("%s%s: sns = %2x %2x\n", devclass, kdevname(dev), sense_buffer[0], sense_buffer[2]); - printk("Non-extended sense class %d code 0x%0x ", sense_class, code); + printk("Non-extended sense class %d code 0x%0x\n", sense_class, code); s = 4; } diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/megaraid.c linux/drivers/scsi/megaraid.c --- v2.4.0/linux/drivers/scsi/megaraid.c Wed Dec 6 12:06:18 2000 +++ linux/drivers/scsi/megaraid.c Tue Jan 9 10:40:43 2001 @@ -149,7 +149,6 @@ #include #ifdef MODULE -#include #include char kernel_version[] = UTS_RELEASE; diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/ppa.c linux/drivers/scsi/ppa.c --- v2.4.0/linux/drivers/scsi/ppa.c Thu Jan 4 13:00:55 2001 +++ linux/drivers/scsi/ppa.c Tue Jan 9 10:40:03 2001 @@ -222,8 +222,8 @@ printk(" supported by the imm (ZIP Plus) driver. If the\n"); printk(" cable is marked with \"AutoDetect\", this is what has\n"); printk(" happened.\n"); - return 0; spin_lock_irq(&io_request_lock); + return 0; } try_again = 1; goto retry_entry; diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/scsi_lib.c linux/drivers/scsi/scsi_lib.c --- v2.4.0/linux/drivers/scsi/scsi_lib.c Sun Sep 17 10:09:29 2000 +++ linux/drivers/scsi/scsi_lib.c Mon Jan 15 16:52:57 2001 @@ -50,6 +50,50 @@ * This entire source file deals with the new queueing code. */ +/* + * Function: __scsi_insert_special() + * + * Purpose: worker for scsi_insert_special_*() + * + * Arguments: q - request queue where request should be inserted + * rq - request to be inserted + * data - private data + * at_head - insert request at head or tail of queue + * + * Lock status: Assumed that io_request_lock is not held upon entry. + * + * Returns: Nothing + */ +static void __scsi_insert_special(request_queue_t *q, struct request *rq, + void *data, int at_head) +{ + unsigned long flags; + + ASSERT_LOCK(&io_request_lock, 0); + + rq->cmd = SPECIAL; + rq->special = data; + rq->q = NULL; + rq->nr_segments = 0; + rq->elevator_sequence = 0; + + /* + * We have the option of inserting the head or the tail of the queue. + * Typically we use the tail for new ioctls and so forth. We use the + * head of the queue for things like a QUEUE_FULL message from a + * device, or a host that is unable to accept a particular command. 
+ */ + spin_lock_irqsave(&io_request_lock, flags); + + if (at_head) + list_add(&rq->queue, &q->queue_head); + else + list_add_tail(&rq->queue, &q->queue_head); + + q->request_fn(q); + spin_unlock_irqrestore(&io_request_lock, flags); +} + /* * Function: scsi_insert_special_cmd() @@ -73,52 +117,9 @@ */ int scsi_insert_special_cmd(Scsi_Cmnd * SCpnt, int at_head) { - unsigned long flags; - request_queue_t *q; - - ASSERT_LOCK(&io_request_lock, 0); - - /* - * The SCpnt already contains a request structure - we will doctor the - * thing up with the appropriate values and use that in the actual - * request queue. - */ - q = &SCpnt->device->request_queue; - SCpnt->request.cmd = SPECIAL; - SCpnt->request.special = (void *) SCpnt; - SCpnt->request.q = NULL; - SCpnt->request.free_list = NULL; - SCpnt->request.nr_segments = 0; - - /* - * We have the option of inserting the head or the tail of the queue. - * Typically we use the tail for new ioctls and so forth. We use the - * head of the queue for things like a QUEUE_FULL message from a - * device, or a host that is unable to accept a particular command. - */ - spin_lock_irqsave(&io_request_lock, flags); - - if (at_head) { - list_add(&SCpnt->request.queue, &q->queue_head); - } else { - /* - * FIXME(eric) - we always insert at the tail of the - * list. Otherwise ioctl commands would always take - * precedence over normal I/O. An ioctl on a busy - * disk might be delayed indefinitely because the - * request might not float high enough in the queue - * to be scheduled. - */ - list_add_tail(&SCpnt->request.queue, &q->queue_head); - } + request_queue_t *q = &SCpnt->device->request_queue; - /* - * Now hit the requeue function for the queue. If the host is - * already busy, so be it - we have nothing special to do. If - * the host can queue it, then send it off. - */ - q->request_fn(q); - spin_unlock_irqrestore(&io_request_lock, flags); + __scsi_insert_special(q, &SCpnt->request, SCpnt, at_head); return 0; } @@ -144,51 +145,9 @@ */ int scsi_insert_special_req(Scsi_Request * SRpnt, int at_head) { - unsigned long flags; - request_queue_t *q; - - ASSERT_LOCK(&io_request_lock, 0); - - /* - * The SCpnt already contains a request structure - we will doctor the - * thing up with the appropriate values and use that in the actual - * request queue. - */ - q = &SRpnt->sr_device->request_queue; - SRpnt->sr_request.cmd = SPECIAL; - SRpnt->sr_request.special = (void *) SRpnt; - SRpnt->sr_request.q = NULL; - SRpnt->sr_request.nr_segments = 0; - - /* - * We have the option of inserting the head or the tail of the queue. - * Typically we use the tail for new ioctls and so forth. We use the - * head of the queue for things like a QUEUE_FULL message from a - * device, or a host that is unable to accept a particular command. - */ - spin_lock_irqsave(&io_request_lock, flags); + request_queue_t *q = &SRpnt->sr_device->request_queue; - if (at_head) { - list_add(&SRpnt->sr_request.queue, &q->queue_head); - } else { - /* - * FIXME(eric) - we always insert at the tail of the - * list. Otherwise ioctl commands would always take - * precedence over normal I/O. An ioctl on a busy - * disk might be delayed indefinitely because the - * request might not float high enough in the queue - * to be scheduled. - */ - list_add_tail(&SRpnt->sr_request.queue, &q->queue_head); - } - - /* - * Now hit the requeue function for the queue. If the host is - * already busy, so be it - we have nothing special to do. If - * the host can queue it, then send it off. 
- */ - q->request_fn(q); - spin_unlock_irqrestore(&io_request_lock, flags); + __scsi_insert_special(q, &SRpnt->sr_request, SRpnt, at_head); return 0; } @@ -403,6 +362,7 @@ struct request *req; struct buffer_head *bh; Scsi_Device * SDpnt; + int nsect; ASSERT_LOCK(&io_request_lock, 0); @@ -414,11 +374,13 @@ } do { if ((bh = req->bh) != NULL) { + nsect = bh->b_size >> 9; + blk_finished_io(nsect); req->bh = bh->b_reqnext; - req->nr_sectors -= bh->b_size >> 9; - req->sector += bh->b_size >> 9; + req->nr_sectors -= nsect; + req->sector += nsect; bh->b_reqnext = NULL; - sectors -= bh->b_size >> 9; + sectors -= nsect; bh->b_end_io(bh, uptodate); if ((bh = req->bh) != NULL) { req->current_nr_sectors = bh->b_size >> 9; @@ -863,17 +825,6 @@ SHpnt = SDpnt->host; /* - * If the host for this device is in error recovery mode, don't - * do anything at all here. When the host leaves error recovery - * mode, it will automatically restart things and start queueing - * commands again. Same goes if the queue is actually plugged, - * if the device itself is blocked, or if the host is fully - * occupied. - */ - if (SHpnt->in_recovery || q->plugged) - return; - - /* * To start with, we keep looping until the queue is empty, or until * the host is no longer able to accept any more requests. */ @@ -896,10 +847,11 @@ || (SHpnt->host_blocked) || (SHpnt->host_self_blocked)) { /* - * If we are unable to process any commands at all for this - * device, then we consider it to be starved. What this means - * is that there are no outstanding commands for this device - * and hence we need a little help getting it started again + * If we are unable to process any commands at all for + * this device, then we consider it to be starved. + * What this means is that there are no outstanding + * commands for this device and hence we need a + * little help getting it started again * once the host isn't quite so busy. */ if (SDpnt->device_busy == 0) { @@ -1000,8 +952,8 @@ } /* * If so, we are ready to do something. Bump the count - * while the queue is locked and then break out of the loop. - * Otherwise loop around and try another request. + * while the queue is locked and then break out of the + * loop. Otherwise loop around and try another request. */ if (!SCpnt) { break; @@ -1029,8 +981,9 @@ memcpy(&SCpnt->request, req, sizeof(struct request)); /* - * We have copied the data out of the request block - it is now in - * a field in SCpnt. Release the request block. + * We have copied the data out of the request block - + * it is now in a field in SCpnt. Release the request + * block. */ blkdev_release_request(req); } @@ -1047,12 +1000,14 @@ /* * This will do a couple of things: * 1) Fill in the actual SCSI command. - * 2) Fill in any other upper-level specific fields (timeout). + * 2) Fill in any other upper-level specific fields + * (timeout). * - * If this returns 0, it means that the request failed (reading - * past end of disk, reading offline device, etc). This won't - * actually talk to the device, but some kinds of consistency - * checking may cause the request to be rejected immediately. + * If this returns 0, it means that the request failed + * (reading past end of disk, reading offline device, + * etc). This won't actually talk to the device, but + * some kinds of consistency checking may cause the + * request to be rejected immediately. */ if (STpnt == NULL) { STpnt = scsi_get_request_dev(req); @@ -1103,8 +1058,8 @@ scsi_dispatch_cmd(SCpnt); /* - * Now we need to grab the lock again. 
We are about to mess with - * the request queue and try to find another command. + * Now we need to grab the lock again. We are about to mess + * with the request queue and try to find another command. */ spin_lock_irq(&io_request_lock); } diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/scsi_merge.c linux/drivers/scsi/scsi_merge.c --- v2.4.0/linux/drivers/scsi/scsi_merge.c Thu Oct 12 11:16:26 2000 +++ linux/drivers/scsi/scsi_merge.c Mon Jan 15 13:08:15 2001 @@ -324,7 +324,6 @@ req->nr_segments >= SHpnt->sg_tablesize) return 0; req->nr_segments++; - q->elevator.nr_segments++; return 1; } @@ -341,11 +340,8 @@ if (req->nr_hw_segments >= SHpnt->sg_tablesize || req->nr_segments >= SHpnt->sg_tablesize) return 0; - if (req->nr_segments >= max_segments) - return 0; req->nr_hw_segments++; req->nr_segments++; - q->elevator.nr_segments++; return 1; } #else @@ -361,7 +357,6 @@ * counter. */ req->nr_segments++; - q->elevator.nr_segments++; return 1; } else { return 0; @@ -417,8 +412,10 @@ SDpnt = (Scsi_Device *) q->queuedata; SHpnt = SDpnt->host; +#ifdef DMA_CHUNK_SIZE if (max_segments > 64) max_segments = 64; +#endif if (use_clustering) { /* @@ -471,8 +468,10 @@ SDpnt = (Scsi_Device *) q->queuedata; SHpnt = SDpnt->host; +#ifdef DMA_CHUNK_SIZE if (max_segments > 64) max_segments = 64; +#endif if (use_clustering) { /* @@ -601,10 +600,10 @@ SDpnt = (Scsi_Device *) q->queuedata; SHpnt = SDpnt->host; +#ifdef DMA_CHUNK_SIZE if (max_segments > 64) max_segments = 64; -#ifdef DMA_CHUNK_SIZE /* If it would not fit into prepared memory space for sg chain, * then don't allow the merge. */ @@ -664,7 +663,6 @@ * This one is OK. Let it go. */ req->nr_segments += next->nr_segments - 1; - q->elevator.nr_segments--; #ifdef DMA_CHUNK_SIZE req->nr_hw_segments += next->nr_hw_segments - 1; #endif diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/sg.c linux/drivers/scsi/sg.c --- v2.4.0/linux/drivers/scsi/sg.c Thu Jan 4 12:50:17 2001 +++ linux/drivers/scsi/sg.c Mon Jan 15 13:08:15 2001 @@ -694,6 +694,7 @@ (void *)SRpnt->sr_buffer, hp->dxfer_len, sg_cmd_done_bh, timeout, SG_DEFAULT_RETRIES); /* dxfer_len overwrites SRpnt->sr_bufflen, hence need for b_malloc_len */ + generic_unplug_device(&SRpnt->sr_device->request_queue); return 0; } diff -u --recursive --new-file v2.4.0/linux/drivers/scsi/sr.c linux/drivers/scsi/sr.c --- v2.4.0/linux/drivers/scsi/sr.c Fri Dec 29 14:07:22 2000 +++ linux/drivers/scsi/sr.c Mon Jan 15 13:08:15 2001 @@ -671,12 +671,14 @@ cmd[3] = cmd[5] = 0; rc = sr_do_ioctl(i, cmd, buffer, 128, 1, SCSI_DATA_READ, NULL); - if (-EINVAL == rc) { - /* failed, drive has'nt this mode page */ + if (rc) { + /* failed, drive doesn't have capabilities mode page */ scsi_CDs[i].cdi.speed = 1; - /* disable speed select, drive probably can't do this either */ - scsi_CDs[i].cdi.mask |= CDC_SELECT_SPEED; + scsi_CDs[i].cdi.mask |= (CDC_CD_R | CDC_CD_RW | CDC_DVD_R | + CDC_DVD | CDC_DVD_RAM | + CDC_SELECT_DISC | CDC_SELECT_SPEED); scsi_free(buffer, 512); + printk("sr%i: scsi-1 drive\n"); return; } n = buffer[3] + 4; diff -u --recursive --new-file v2.4.0/linux/fs/Config.in linux/fs/Config.in --- v2.4.0/linux/fs/Config.in Thu Nov 9 16:04:42 2000 +++ linux/fs/Config.in Mon Jan 15 12:42:32 2001 @@ -8,6 +8,8 @@ tristate 'Kernel automounter support' CONFIG_AUTOFS_FS tristate 'Kernel automounter version 4 support (also supports v3)' CONFIG_AUTOFS4_FS +dep_tristate 'Reiserfs support' CONFIG_REISERFS_FS $CONFIG_EXPERIMENTAL +dep_mbool ' Have reiserfs do extra internal checking' CONFIG_REISERFS_CHECK $CONFIG_REISERFS_FS 
$CONFIG_EXPERIMENTAL dep_tristate 'ADFS file system support' CONFIG_ADFS_FS $CONFIG_EXPERIMENTAL dep_mbool ' ADFS write support (DANGEROUS)' CONFIG_ADFS_FS_RW $CONFIG_ADFS_FS $CONFIG_EXPERIMENTAL diff -u --recursive --new-file v2.4.0/linux/fs/Makefile linux/fs/Makefile --- v2.4.0/linux/fs/Makefile Fri Dec 29 14:07:23 2000 +++ linux/fs/Makefile Mon Jan 15 12:42:32 2001 @@ -58,6 +58,7 @@ subdir-$(CONFIG_AUTOFS_FS) += autofs subdir-$(CONFIG_AUTOFS4_FS) += autofs4 subdir-$(CONFIG_ADFS_FS) += adfs +subdir-$(CONFIG_REISERFS_FS) += reiserfs subdir-$(CONFIG_DEVPTS_FS) += devpts subdir-$(CONFIG_SUN_OPENPROMFS) += openpromfs diff -u --recursive --new-file v2.4.0/linux/fs/buffer.c linux/fs/buffer.c --- v2.4.0/linux/fs/buffer.c Wed Jan 3 20:45:26 2001 +++ linux/fs/buffer.c Mon Jan 15 12:42:32 2001 @@ -834,6 +834,10 @@ return; } +void set_buffer_async_io(struct buffer_head *bh) { + bh->b_end_io = end_buffer_io_async ; +} + /* * Synchronise all the inode's dirty buffers to the disk. * @@ -1151,7 +1155,7 @@ /* grab the lru lock here to block bdflush. */ spin_lock(&lru_list_lock); write_lock(&hash_table_lock); - if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) + if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf) || buffer_protected(buf)) goto in_use; __hash_unlink(buf); remove_inode_queue(buf); @@ -2411,6 +2415,7 @@ loop = 1; goto cleaned_buffers_try_again; } + wakeup_bdflush(0); } return 0; } diff -u --recursive --new-file v2.4.0/linux/fs/exec.c linux/fs/exec.c --- v2.4.0/linux/fs/exec.c Wed Jan 3 20:45:26 2001 +++ linux/fs/exec.c Mon Jan 8 13:31:56 2001 @@ -407,6 +407,7 @@ /* Add it to the list of mm's */ spin_lock(&mmlist_lock); list_add(&mm->mmlist, &init_mm.mmlist); + mmlist_nr++; spin_unlock(&mmlist_lock); task_lock(current); diff -u --recursive --new-file v2.4.0/linux/fs/inode.c linux/fs/inode.c --- v2.4.0/linux/fs/inode.c Fri Dec 29 15:35:42 2000 +++ linux/fs/inode.c Mon Jan 15 18:20:14 2001 @@ -136,6 +136,16 @@ struct super_block * sb = inode->i_sb; if (sb) { + /* Don't do this for I_DIRTY_PAGES - that doesn't actually dirty the inode itself */ + if (flags & (I_DIRTY | I_DIRTY_SYNC)) { + if (sb->s_op && sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + spin_lock(&inode_lock); if ((inode->i_state & flags) != flags) { inode->i_state |= flags; @@ -676,7 +686,17 @@ spin_unlock(&inode_lock); clean_inode(inode); - sb->s_op->read_inode(inode); + + /* reiserfs specific hack right here. We don't + ** want this to last, and are looking for VFS changes + ** that will allow us to get rid of it. + ** -- mason@suse.com + */ + if (sb->s_op->read_inode2) { + sb->s_op->read_inode2(inode, opaque) ; + } else { + sb->s_op->read_inode(inode); + } /* * This is special! 
We do not need the spinlock diff -u --recursive --new-file v2.4.0/linux/fs/nfs/flushd.c linux/fs/nfs/flushd.c --- v2.4.0/linux/fs/nfs/flushd.c Wed Jun 21 07:25:17 2000 +++ linux/fs/nfs/flushd.c Wed Jan 10 14:18:29 2001 @@ -71,18 +71,17 @@ int status = 0; dprintk("NFS: writecache_init\n"); + + /* Create the RPC task */ + if (!(task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC))) + return -ENOMEM; + spin_lock(&nfs_flushd_lock); cache = server->rw_requests; if (cache->task) goto out_unlock; - /* Create the RPC task */ - status = -ENOMEM; - task = rpc_new_task(server->client, NULL, RPC_TASK_ASYNC); - if (!task) - goto out_unlock; - task->tk_calldata = server; cache->task = task; @@ -99,6 +98,7 @@ return 0; out_unlock: spin_unlock(&nfs_flushd_lock); + rpc_release_task(task); return status; } @@ -195,7 +195,9 @@ if (*q) { *q = inode->u.nfs_i.hash_next; NFS_FLAGS(inode) &= ~NFS_INO_FLUSH; + spin_unlock(&nfs_flushd_lock); iput(inode); + return; } out: spin_unlock(&nfs_flushd_lock); diff -u --recursive --new-file v2.4.0/linux/fs/proc/kcore.c linux/fs/proc/kcore.c --- v2.4.0/linux/fs/proc/kcore.c Thu Sep 7 08:43:49 2000 +++ linux/fs/proc/kcore.c Mon Jan 15 16:54:20 2001 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff -u --recursive --new-file v2.4.0/linux/fs/ramfs/inode.c linux/fs/ramfs/inode.c --- v2.4.0/linux/fs/ramfs/inode.c Fri Dec 29 19:26:31 2000 +++ linux/fs/ramfs/inode.c Fri Jan 5 23:06:19 2001 @@ -81,6 +81,7 @@ static int ramfs_writepage(struct page *page) { SetPageDirty(page); + UnlockPage(page); return 0; } diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/Makefile linux/fs/reiserfs/Makefile --- v2.4.0/linux/fs/reiserfs/Makefile Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/Makefile Mon Jan 15 12:42:32 2001 @@ -0,0 +1,20 @@ +# +# Makefile for the linux reiser-filesystem routines. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definitions are now in the main makefile... + +O_TARGET := reiserfs.o +obj-y := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o super.o prints.o objectid.o \ +lbalance.o ibalance.o stree.o hashes.o buffer2.o tail_conversion.o journal.o resize.o tail_conversion.o version.o item_ops.o ioctl.o + +obj-m := $(O_TARGET) + +include $(TOPDIR)/Rules.make + +TAGS: + etags *.c + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/README linux/fs/reiserfs/README --- v2.4.0/linux/fs/reiserfs/README Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/README Mon Jan 15 12:42:32 2001 @@ -0,0 +1,157 @@ +[LICENSING] + +ReiserFS is hereby licensed under the GNU General +Public License version 2. + +Source code files that contain the phrase "licensing governed by +reiserfs/README" are "governed files" throughout this file. Governed +files are licensed under the GPL. The portions of them owned by Hans +Reiser, or authorized to be licensed by him, have been in the past, +and likely will be in the future, licensed to other parties under +other licenses. If you add your code to governed files, and don't +want it to be owned by Hans Reiser, put your copyright label on that +code so the poor blight and his customers can keep things straight. 
+All portions of governed files not labeled otherwise are owned by Hans +Reiser, and by adding your code to it, widely distributing it to +others or sending us a patch, and leaving the sentence in stating that +licensing is governed by the statement in this file, you accept this. +It will be a kindness if you identify whether Hans Reiser is allowed +to license code labeled as owned by you on your behalf other than +under the GPL, because he wants to know if it is okay to do so and put +a check in the mail to you (for non-trivial improvements) when he +makes his next sale. He makes no guarantees as to the amount if any, +though he feels motivated to motivate contributors, and you can surely +discuss this with him before or after contributing. You have the +right to decline to allow him to license your code contribution other +than under the GPL. + +Further licensing options are available for commercial and/or other +interests directly from Hans Reiser: hans@reiser.to. If you interpret +the GPL as not allowing those additional licensing options, you read +it wrongly, and Richard Stallman agrees with me, when carefully read +you can see that those restrictions on additional terms do not apply +to the owner of the copyright, and my interpretation of this shall +govern for this license. + +Finally, nothing in this license shall be interpreted to allow you to +fail to fairly credit me, or to remove my credits, without my +permission, unless you are an end user not redistributing to others. +If you have doubts about how to properly do that, or about what is +fair, ask. (Last I spoke with him Richard was contemplating how best +to address the fair crediting issue in the next GPL version.) + +[END LICENSING] + +Reiserfs is a file system based on balanced tree algorithms, which is +described at http://devlinux.com/namesys. + +Stop reading here. Go there, then return. + +Send bug reports to yura@namesys.botik.ru. + +mkreiserfs and other utilities are in reiserfs/utils, or wherever your +Linux provider put them. There is some disagreement about how useful +it is for users to get their fsck and mkreiserfs out of sync with the +version of reiserfs that is in their kernel, with many important +distributors wanting them out of sync.:-) Please try to remember to +recompile and reinstall fsck and mkreiserfs with every update of +reiserfs, this is a common source of confusion. Note that some of the +utilities cannot be compiled without accessing the balancing code +which is in the kernel code, and relocating the utilities may require +you to specify where that code can be found. + +Yes, if you update your reiserfs kernel module you do have to +recompile your kernel, most of the time. The errors you get will be +quite cryptic if your forget to do so. + +Real users, as opposed to folks who want to hack and then understand +what went wrong, will want REISERFS_CHECK off. + +Hideous Commercial Pitch: Spread your development costs across other OS +vendors. Select from the best in the world, not the best in your +building, by buying from third party OS component suppliers. Leverage +the software component development power of the internet. Be the most +aggressive in taking advantage of the commercial possibilities of +decentralized internet development, and add value through your branded +integration that you sell as an operating system. Let your competitors +be the ones to compete against the entire internet by themselves. Be +hip, get with the new economic trend, before your competitors do. 
Send +email to hans@reiser.to. + +To understand the code, after reading the website, start reading the +code by reading reiserfs_fs.h first. + +Hans Reiser was the project initiator, primary architect, source of all +funding for the first 5.5 years, and one of the programmers. He owns +the copyright. + +Vladimir Saveljev was one of the programmers, and he worked long hours +writing the cleanest code. He always made the effort to be the best he +could be, and to make his code the best that it could be. What resulted +was quite remarkable. I don't think that money can ever motivate someone +to work the way he did, he is one of the most selfless men I know. + +Yura helps with benchmarking, coding hashes, and block pre-allocation +code. + +Anatoly Pinchuk is a former member of our team who worked closely with +Vladimir throughout the project's development. He wrote a quite +substantial portion of the total code. He realized that there was a +space problem with packing tails of files for files larger than a node +that start on a node aligned boundary (there are reasons to want to node +align files), and he invented and implemented indirect items and +unformatted nodes as the solution. + +Konstantin Shvachko, with the help of the Russian version of a VC, +tried to put me in a position where I was forced into giving control +of the project to him. (Fortunately, as the person paying the money +for all salaries from my dayjob I owned all copyrights, and you can't +really force takeovers of sole proprietorships.) This was something +curious, because he never really understood the value of our project, +why we should do what we do, or why innovation was possible in +general, but he was sure that he ought to be controlling it. Every +innovation had to be forced past him while he was with us. He added +two years to the time required to complete reiserfs, and was a net +loss for me. Mikhail Gilula was a brilliant innovator who also left +in a destructive way that erased the value of his contributions, and +that he was shown much generosity just makes it more painful. + +Grigory Zaigralin was an extremely effective system administrator for +our group. + +Igor Krasheninnikov was wonderful at hardware procurement, repair, and +network installation. + +Jeremy Fitzhardinge wrote the teahash.c code, and he gives credit to a +textbook he got the algorithm from in the code. Note that his analysis +of how we could use the hashing code in making 32 bit NFS cookies work +was probably more important than the actual algorithm. Colin Plumb also +contributed to it. + +Chris Mason dived right into our code, and in just a few months produced +the journaling code that dramatically increased the value of ReiserFS. +He is just an amazing programmer. + +Igor Zagorovsky is writing much of the new item handler and extent code +for our next major release. + +Alexander Zarochentcev (sometimes known as zam, or sasha), wrote the +resizer, and is hard at work on implementing allocate on flush. SGI +implemented allocate on flush before us for XFS, and generously took +the time to convince me we should do it also. They are great people, +and a great company. + +Yuri Shevchuk and Nikita Danilov are doing squid cache optimization. + +Vitaly Fertman is doing fsck. + +SuSE, IntegratedLinux.com, Ecila, MP3.com, bigstorage.com, and the +Alpha PC Company made it possible for me to not have a day job +anymore, and to dramatically increase our staffing. 
Ecila funded +hypertext feature development, MP3.com funded journaling, SuSE funded +core development, IntegratedLinux.com funded squid web cache +appliances, bigstorage.com funded HSM, and the alpha PC company funded +the alpha port. Many of these tasks were helped by sponsors other +than the ones just named. SuSE has helped in much more than just +funding.... + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/bitmap.c linux/fs/reiserfs/bitmap.c --- v2.4.0/linux/fs/reiserfs/bitmap.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/bitmap.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,679 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +#ifdef CONFIG_REISERFS_CHECK + +/* this is a safety check to make sure +** blocks are reused properly. used for debugging only. +** +** this checks, that block can be reused, and it has correct state +** (free or busy) +*/ +int is_reusable (struct super_block * s, unsigned long block, int bit_value) +{ + int i, j; + + if (block == 0 || block >= SB_BLOCK_COUNT (s)) { + reiserfs_warning ("vs-4010: is_reusable: block number is out of range %lu (%u)\n", + block, SB_BLOCK_COUNT (s)); + return 0; + } + + /* it can't be one of the bitmap blocks */ + for (i = 0; i < SB_BMAP_NR (s); i ++) + if (block == SB_AP_BITMAP (s)[i]->b_blocknr) { + reiserfs_warning ("vs: 4020: is_reusable: " + "bitmap block %lu(%u) can't be freed or reused\n", + block, SB_BMAP_NR (s)); + return 0; + } + + i = block / (s->s_blocksize << 3); + if (i >= SB_BMAP_NR (s)) { + reiserfs_warning ("vs-4030: is_reusable: there is no so many bitmap blocks: " + "block=%lu, bitmap_nr=%d\n", block, i); + return 0; + } + + j = block % (s->s_blocksize << 3); + if ((bit_value == 0 && + reiserfs_test_le_bit(j, SB_AP_BITMAP(s)[i]->b_data)) || + (bit_value == 1 && + reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i]->b_data) == 0)) { + reiserfs_warning ("vs-4040: is_reusable: corresponding bit of block %lu does not " + "match required value (i==%d, j==%d) test_bit==%d\n", + block, i, j, reiserfs_test_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)); + return 0; + } + + if (bit_value == 0 && block == SB_ROOT_BLOCK (s)) { + reiserfs_warning ("vs-4050: is_reusable: this is root block (%u), " + "it must be busy", SB_ROOT_BLOCK (s)); + return 0; + } + + return 1; +} + + + + +#endif /* CONFIG_REISERFS_CHECK */ + +#if 0 +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ +int is_used (struct super_block * s, unsigned long block) +{ + int i, j; + + i = block / (s->s_blocksize << 3); + j = block % (s->s_blocksize << 3); + if (reiserfs_test_le_bit(j, SB_AP_BITMAP (s)[i]->b_data)) + return 1; + return 0; + +} +/*%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%*/ +#endif + + +/* get address of corresponding bit (bitmap block number and offset in it) */ +static inline void get_bit_address (struct super_block * s, unsigned long block, int * bmap_nr, int * offset) +{ + /* It is in the bitmap block number equal to the block number divided by the number of + bits in a block. */ + *bmap_nr = block / (s->s_blocksize << 3); + /* Within that bitmap block it is located at bit offset *offset. */ + *offset = block % (s->s_blocksize << 3); + return; +} + + +/* There would be a modest performance benefit if we write a version + to free a list of blocks at once. -Hans */ + /* I wonder if it would be less modest + now that we use journaling. 
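get_bit_address() above reduces a block number to a (bitmap block, bit offset) pair: a bitmap block of s_blocksize bytes covers s_blocksize << 3 blocks, so the quotient selects the bitmap block and the remainder the bit within it. A self-contained sketch of that arithmetic, assuming a fixed 4096-byte block size purely for the example:

#include <stdio.h>

#define BLOCKSIZE 4096			/* example value; reiserfs uses s->s_blocksize */
#define BITS_PER_BITMAP_BLOCK (BLOCKSIZE << 3)

/* Map a filesystem block number onto the bitmap block that tracks it. */
static void get_bit_address(unsigned long block, int *bmap_nr, int *offset)
{
	*bmap_nr = block / BITS_PER_BITMAP_BLOCK;	/* which bitmap block */
	*offset  = block % BITS_PER_BITMAP_BLOCK;	/* which bit inside it */
}

int main(void)
{
	int nr, off;

	get_bit_address(100000, &nr, &off);
	/* 100000 / 32768 = 3, 100000 % 32768 = 1696 */
	printf("block 100000 -> bitmap %d, bit %d\n", nr, off);
	return 0;
}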
-Hans */ +void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long block) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs; + struct buffer_head * sbh; + struct buffer_head ** apbh; + int nr, offset; + +#ifdef CONFIG_REISERFS_CHECK + if (!s) + reiserfs_panic (s, "vs-4060: reiserfs_free_block: trying to free block on nonexistent device"); + + if (is_reusable (s, block, 1) == 0) + reiserfs_panic (s, "vs-4070: reiserfs_free_block: can not free such block"); +#endif + + rs = SB_DISK_SUPER_BLOCK (s); + sbh = SB_BUFFER_WITH_SB (s); + apbh = SB_AP_BITMAP (s); + + get_bit_address (s, block, &nr, &offset); + + /* mark it before we clear it, just in case */ + journal_mark_freed(th, s, block) ; + + reiserfs_prepare_for_journal(s, apbh[nr], 1 ) ; + + /* clear bit for the given block in bit map */ + if (!reiserfs_test_and_clear_le_bit (offset, apbh[nr]->b_data)) { + reiserfs_warning ("vs-4080: reiserfs_free_block: " + "free_block (%04x:%lu)[dev:blocknr]: bit already cleared\n", + s->s_dev, block); + } + journal_mark_dirty (th, s, apbh[nr]); + + reiserfs_prepare_for_journal(s, sbh, 1) ; + /* update super block */ + rs->s_free_blocks = cpu_to_le32 (le32_to_cpu (rs->s_free_blocks) + 1); + + journal_mark_dirty (th, s, sbh); + s->s_dirt = 1; +} + + + +/* beginning from offset-th bit in bmap_nr-th bitmap block, + find_forward finds the closest zero bit. It returns 1 and zero + bit address (bitmap, offset) if zero bit found or 0 if there is no + zero bit in the forward direction */ +/* The function is NOT SCHEDULE-SAFE! */ +static int find_forward (struct super_block * s, int * bmap_nr, int * offset, int for_unformatted) +{ + int i, j; + struct buffer_head * bh; + unsigned long block_to_try = 0; + unsigned long next_block_to_try = 0 ; + + for (i = *bmap_nr; i < SB_BMAP_NR (s); i ++, *offset = 0) { + /* get corresponding bitmap block */ + bh = SB_AP_BITMAP (s)[i]; + if (buffer_locked (bh)) { + __wait_on_buffer (bh); + } +retry: + j = reiserfs_find_next_zero_le_bit ((unsigned long *)bh->b_data, + s->s_blocksize << 3, *offset); + + /* wow, this really needs to be redone. We can't allocate a block if + ** it is in the journal somehow. reiserfs_in_journal makes a suggestion + ** for a good block if the one you ask for is in the journal. Note, + ** reiserfs_in_journal might reject the block it suggests. The big + ** gain from the suggestion is when a big file has been deleted, and + ** many blocks show free in the real bitmap, but are all not free + ** in the journal list bitmaps. + ** + ** this whole system sucks. The bitmaps should reflect exactly what + ** can and can't be allocated, and the journal should update them as + ** it goes. TODO. + */ + if (j < (s->s_blocksize << 3)) { + block_to_try = (i * (s->s_blocksize << 3)) + j; + + /* the block is not in the journal, we can proceed */ + if (!(reiserfs_in_journal(s, s->s_dev, block_to_try, s->s_blocksize, for_unformatted, &next_block_to_try))) { + *bmap_nr = i; + *offset = j; + return 1; + } + /* the block is in the journal */ + else if ((j+1) < (s->s_blocksize << 3)) { /* try again */ + /* reiserfs_in_journal suggested a new block to try */ + if (next_block_to_try > 0) { + int new_i ; + get_bit_address (s, next_block_to_try, &new_i, offset); + + /* block is not in this bitmap. reset i and continue + ** we only reset i if new_i is in a later bitmap. 
+ */ + if (new_i > i) { + i = (new_i - 1 ); /* i gets incremented by the for loop */ + continue ; + } + } else { + /* no suggestion was made, just try the next block */ + *offset = j+1 ; + } + goto retry ; + } + } + } + /* zero bit not found */ + return 0; +} + +/* return 0 if no free blocks, else return 1 */ +/* The function is NOT SCHEDULE-SAFE! +** because the bitmap block we want to change could be locked, and on its +** way to the disk when we want to read it, and because of the +** flush_async_commits. Per bitmap block locks won't help much, and +** really aren't needed, as we retry later on if we try to set the bit +** and it is already set. +*/ +static int find_zero_bit_in_bitmap (struct super_block * s, + unsigned long search_start, + int * bmap_nr, int * offset, + int for_unformatted) +{ + int retry_count = 0 ; + /* get bit location (bitmap number and bit offset) of search_start block */ + get_bit_address (s, search_start, bmap_nr, offset); + + /* note that we search forward in the bitmap, benchmarks have shown that it is better to allocate in increasing + sequence, which is probably due to the disk spinning in the forward direction.. */ + if (find_forward (s, bmap_nr, offset, for_unformatted) == 0) { + /* there wasn't a free block with number greater than our + starting point, so we are going to go to the beginning of the disk */ + +retry: + search_start = 0; /* caller will reset search_start for itself also. */ + get_bit_address (s, search_start, bmap_nr, offset); + if (find_forward (s, bmap_nr,offset,for_unformatted) == 0) { + if (for_unformatted) { /* why only unformatted nodes? -Hans */ + if (retry_count == 0) { + /* we've got a chance that flushing async commits will free up + ** some space. Sync then retry + */ + flush_async_commits(s) ; + retry_count++ ; + goto retry ; + } else if (retry_count > 0) { + /* nothing more we can do. Make the others wait, flush + ** all log blocks to disk, and flush to their home locations. + ** this will free up any blocks held by the journal + */ + SB_JOURNAL(s)->j_must_wait = 1 ; + } + } + return 0; + } + } + return 1; +} + +/* get amount_needed free block numbers from scanning the bitmap of + free/used blocks. + + Optimize layout by trying to find them starting from search_start + and moving in increasing blocknr direction. (This was found to be + faster than using a bi-directional elevator_direction, in part + because of disk spin direction, in part because by the time one + reaches the end of the disk the beginning of the disk is the least + congested). + + search_start is the block number of the left + semantic neighbor of the node we create. + + return CARRY_ON if everything is ok + return NO_DISK_SPACE if out of disk space + return NO_MORE_UNUSED_CONTIGUOUS_BLOCKS if the block we found is not contiguous to the last one + + return block numbers found, in the array free_blocknrs. assumes + that any non-zero entries already present in the array are valid. + This feature is perhaps convenient coding when one might not have + used all blocknrs from the last time one called this function, or + perhaps it is an archaism from the days of schedule tracking, one + of us ought to reread the code that calls this, and analyze whether + it is still the right way to code it. + + spare space is used only when priority is set to 1. reiserfsck has + its own reiserfs_new_blocknrs, which can use reserved space + + exactly what reserved space? the SPARE_SPACE? if so, please comment reiserfs.h. 
+ + Give example of who uses spare space, and say that it is a deadlock + avoidance mechanism. -Hans */ + +/* This function is NOT SCHEDULE-SAFE! */ + +static int do_reiserfs_new_blocknrs (struct reiserfs_transaction_handle *th, + unsigned long * free_blocknrs, + unsigned long search_start, + int amount_needed, int priority, + int for_unformatted, + int for_prealloc) +{ + struct super_block * s = th->t_super; + int i, j; + unsigned long * block_list_start = free_blocknrs; + int init_amount_needed = amount_needed; + unsigned long new_block = 0 ; + + if (SB_FREE_BLOCKS (s) < SPARE_SPACE && !priority) + /* we can answer NO_DISK_SPACE being asked for new block with + priority 0 */ + return NO_DISK_SPACE; + +#ifdef CONFIG_REISERFS_CHECK + if (!s) + reiserfs_panic (s, "vs-4090: reiserfs_new_blocknrs: trying to get new block from nonexistent device"); + + if (search_start == MAX_B_NUM) + reiserfs_panic (s, "vs-4100: reiserfs_new_blocknrs: we are optimizing location based on " + "the bogus location of a temp buffer (%lu).", search_start); + + if (amount_needed < 1 || amount_needed > 2) + reiserfs_panic (s, "vs-4110: reiserfs_new_blocknrs: amount_needed parameter incorrect (%d)", amount_needed); +#endif /* CONFIG_REISERFS_CHECK */ + + /* We continue the while loop if another process snatches our found + * free block from us after we find it but before we successfully + * mark it as in use, or if we need to use sync to free up some + * blocks on the preserve list. */ + + while (amount_needed--) { + /* skip over any blocknrs already gotten last time. */ + if (*(free_blocknrs) != 0) { +#ifdef CONFIG_REISERFS_CHECK + if (is_reusable (s, *free_blocknrs, 1) == 0) + reiserfs_panic(s, "vs-4120: reiserfs_new_blocknrs: bad blocknr on free_blocknrs list"); +#endif /* CONFIG_REISERFS_CHECK */ + free_blocknrs++; + continue; + } + /* look for zero bits in bitmap */ + if (find_zero_bit_in_bitmap(s,search_start, &i, &j,for_unformatted) == 0) { + if (find_zero_bit_in_bitmap(s,search_start,&i,&j, for_unformatted) == 0) { + /* recode without the goto and without + the if. It will require a + duplicate for. This is worth the + code clarity. Your way was + admirable, and just a bit too + clever in saving instructions.:-) + I'd say create a new function, but + that would slow things also, yes? + -Hans */ +free_and_return: + for ( ; block_list_start != free_blocknrs; block_list_start++) { + reiserfs_free_block (th, *block_list_start); + *block_list_start = 0; + } + if (for_prealloc) + return NO_MORE_UNUSED_CONTIGUOUS_BLOCKS; + else + return NO_DISK_SPACE; + } + } + + /* i and j now contain the results of the search. i = bitmap block + number containing free block, j = offset in this block. we + compute the blocknr which is our result, store it in + free_blocknrs, and increment the pointer so that on the next + loop we will insert into the next location in the array. Also + in preparation for the next loop, search_start is changed so + that the next search will not rescan the same range but will + start where this search finished. Note that while it is + possible that schedule has occurred and blocks have been freed + in that range, it is perhaps more important that the blocks + returned be near each other than that they be near their other + neighbors, and it also simplifies and speeds the code this way. */ + + /* journal: we need to make sure the block we are giving out is not + ** a log block, horrible things would happen there. 
+ */ + new_block = (i * (s->s_blocksize << 3)) + j; + if (for_prealloc && (new_block - 1) != search_start) { + /* preallocated blocks must be contiguous, bail if we didnt find one. + ** this is not a bug. We want to do the check here, before the + ** bitmap block is prepared, and before we set the bit and log the + ** bitmap. + ** + ** If we do the check after this function returns, we have to + ** call reiserfs_free_block for new_block, which would be pure + ** overhead. + ** + ** for_prealloc should only be set if the caller can deal with the + ** NO_MORE_UNUSED_CONTIGUOUS_BLOCKS return value. This can be + ** returned before the disk is actually full + */ + goto free_and_return ; + } + search_start = new_block ; + if (search_start >= reiserfs_get_journal_block(s) && + search_start < (reiserfs_get_journal_block(s) + JOURNAL_BLOCK_COUNT)) { + reiserfs_warning("vs-4130: reiserfs_new_blocknrs: trying to allocate log block %lu\n", + search_start) ; + search_start++ ; + amount_needed++ ; + continue ; + } + + + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[i], 1) ; + +#ifdef CONFIG_REISERFS_CHECK + if (buffer_locked (SB_AP_BITMAP (s)[i]) || is_reusable (s, search_start, 0) == 0) + reiserfs_panic (s, "vs-4140: reiserfs_new_blocknrs: bitmap block is locked or bad block number found"); +#endif + + /* if this bit was already set, we've scheduled, and someone else + ** has allocated it. loop around and try again + */ + if (reiserfs_test_and_set_le_bit (j, SB_AP_BITMAP (s)[i]->b_data)) { + reiserfs_warning("vs-4150: reiserfs_new_blocknrs, block not free"); + reiserfs_restore_prepared_buffer(s, SB_AP_BITMAP(s)[i]) ; + amount_needed++ ; + continue ; + } + journal_mark_dirty (th, s, SB_AP_BITMAP (s)[i]); + *free_blocknrs = search_start ; + free_blocknrs ++; + } + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + /* update free block count in super block */ + s->u.reiserfs_sb.s_rs->s_free_blocks = cpu_to_le32 (SB_FREE_BLOCKS (s) - init_amount_needed); + journal_mark_dirty (th, s, SB_BUFFER_WITH_SB (s)); + s->s_dirt = 1; + + return CARRY_ON; +} + +// this is called only by get_empty_nodes with for_preserve_list==0 +int reiserfs_new_blocknrs (struct reiserfs_transaction_handle *th, unsigned long * free_blocknrs, + unsigned long search_start, int amount_needed) { + return do_reiserfs_new_blocknrs(th, free_blocknrs, search_start, amount_needed, 0/*for_preserve_list-priority*/, 0/*for_formatted*/, 0/*for_prealloc */) ; +} + + +// called by get_new_buffer and by reiserfs_get_block with amount_needed == 1 and for_preserve_list == 0 +int reiserfs_new_unf_blocknrs(struct reiserfs_transaction_handle *th, unsigned long * free_blocknrs, + unsigned long search_start) { + return do_reiserfs_new_blocknrs(th, free_blocknrs, search_start, + 1/*amount_needed*/, + 0/*for_preserve_list-priority*/, + 1/*for formatted*/, + 0/*for prealloc */) ; +} + +#ifdef REISERFS_PREALLOCATE + +/* +** We pre-allocate 8 blocks. Pre-allocation is used for files > 16 KB only. +** This lowers fragmentation on large files by grabbing a contiguous set of +** blocks at once. It also limits the number of times the bitmap block is +** logged by making X number of allocation changes in a single transaction. +** +** We are using a border to divide the disk into two parts. The first part +** is used for tree blocks, which have a very high turnover rate (they +** are constantly allocated then freed) +** +** The second part of the disk is for the unformatted nodes of larger files. 
+** Putting them away from the tree blocks lowers fragmentation, and makes +** it easier to group files together. There are a number of different +** allocation schemes being tried right now, each is documented below. +** +** A great deal of the allocator's speed comes because reiserfs_get_block +** sends us the block number of the last unformatted node in the file. Once +** a given block is allocated past the border, we don't collide with the +** blocks near the search_start again. +** +*/ +int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, + unsigned long * free_blocknrs, + unsigned long search_start) +{ + int ret=0, blks_gotten=0; + unsigned long border = 0; + unsigned long bstart = 0; + unsigned long hash_in, hash_out; + int allocated[PREALLOCATION_SIZE]; + int blks; + + if (!reiserfs_no_border(th->t_super)) { + /* we default to having the border at the 10% mark of the disk. This + ** is an arbitrary decision and it needs tuning. It also needs a limit + ** to prevent it from taking too much space on huge drives. + */ + bstart = (SB_BLOCK_COUNT(th->t_super) / 10); + } + if (!reiserfs_no_unhashed_relocation(th->t_super)) { + /* this is a very simple first attempt at preventing too much grouping + ** around the border value. Since k_dir_id is never larger than the + ** highest allocated oid, it is far from perfect, and files will tend + ** to be grouped towards the start of the border + */ + border = (INODE_PKEY(p_s_inode)->k_dir_id) % (SB_BLOCK_COUNT(th->t_super) - bstart - 1) ; + } else { + /* why would we want to delcare a local variable to this if statement + ** name border????? -chris + ** unsigned long border = 0; + */ + if (!reiserfs_hashed_relocation(th->t_super)) { + hash_in = (INODE_PKEY(p_s_inode))->k_dir_id; + /* I wonder if the CPU cost of the + hash will obscure the layout + effect? Of course, whether that + effect is good or bad we don't + know.... :-) */ + + hash_out = keyed_hash(((char *) (&hash_in)), 4); + border = hash_out % (SB_BLOCK_COUNT(th->t_super) - bstart - 1) ; + } + } + border += bstart ; + allocated[0] = 0 ; /* important. Allows a check later on to see if at + * least one block was allocated. This prevents false + * no disk space returns + */ + + if ( (p_s_inode->i_size < 4 * 4096) || + !(S_ISREG(p_s_inode->i_mode)) ) + { + if ( search_start < border + || ( + /* allow us to test whether it is a + good idea to prevent files from + getting too far away from their + packing locality by some unexpected + means. This might be poor code for + directories whose files total + larger than 1/10th of the disk, and + it might be good code for + suffering from old insertions when the disk + was almost full. */ + /* changed from !reiserfs_test3(th->t_super), which doesn't + ** seem like a good idea. Think about adding blocks to + ** a large file. If you've allocated 10% of the disk + ** in contiguous blocks, you start over at the border value + ** for every new allocation. This throws away all the + ** information sent in about the last block that was allocated + ** in the file. Not a good general case at all. 
+ ** -chris + */ + reiserfs_test4(th->t_super) && + (search_start > border + (SB_BLOCK_COUNT(th->t_super) / 10)) + ) + ) + search_start=border; + + ret = do_reiserfs_new_blocknrs(th, free_blocknrs, search_start, + 1/*amount_needed*/, + 0/*use reserved blocks for root */, + 1/*for_formatted*/, + 0/*for prealloc */) ; + return ret; + } + + /* take a block off the prealloc list and return it -Hans */ + if (p_s_inode->u.reiserfs_i.i_prealloc_count > 0) { + p_s_inode->u.reiserfs_i.i_prealloc_count--; + *free_blocknrs = p_s_inode->u.reiserfs_i.i_prealloc_block++; + return ret; + } + + /* else get a new preallocation for the file */ + reiserfs_discard_prealloc (th, p_s_inode); + /* this uses the last preallocated block as the search_start. discard + ** prealloc does not zero out this number. + */ + if (search_start <= p_s_inode->u.reiserfs_i.i_prealloc_block) { + search_start = p_s_inode->u.reiserfs_i.i_prealloc_block; + } + + /* doing the compare again forces search_start to be >= the border, + ** even if the file already had prealloction done. This seems extra, + ** and should probably be removed + */ + if ( search_start < border ) search_start=border; + + *free_blocknrs = 0; + blks = PREALLOCATION_SIZE-1; + for (blks_gotten=0; blks_gotten 0)/*must_be_contiguous*/) ; + /* if we didn't find a block this time, adjust blks to reflect + ** the actual number of blocks allocated + */ + if (ret != CARRY_ON) { + blks = blks_gotten > 0 ? (blks_gotten - 1) : 0 ; + break ; + } + allocated[blks_gotten]= *free_blocknrs; +#ifdef CONFIG_REISERFS_CHECK + if ( (blks_gotten>0) && (allocated[blks_gotten] - allocated[blks_gotten-1]) != 1 ) { + /* this should be caught by new_blocknrs now, checking code */ + reiserfs_warning("yura-1, reiserfs_new_unf_blocknrs2: pre-allocated not contiguous set of blocks!\n") ; + reiserfs_free_block(th, allocated[blks_gotten]); + blks = blks_gotten-1; + break; + } +#endif + if (blks_gotten==0) { + p_s_inode->u.reiserfs_i.i_prealloc_block = *free_blocknrs; + } + search_start = *free_blocknrs; + *free_blocknrs = 0; + } + p_s_inode->u.reiserfs_i.i_prealloc_count = blks; + *free_blocknrs = p_s_inode->u.reiserfs_i.i_prealloc_block; + p_s_inode->u.reiserfs_i.i_prealloc_block++; + + /* we did actually manage to get 1 block */ + if (ret != CARRY_ON && allocated[0] > 0) { + return CARRY_ON ; + } + /* NO_MORE_UNUSED_CONTIGUOUS_BLOCKS should only mean something to + ** the preallocation code. The rest of the filesystem asks for a block + ** and should either get it, or know the disk is full. The code + ** above should never allow ret == NO_MORE_UNUSED_CONTIGUOUS_BLOCK, + ** as it doesn't send for_prealloc = 1 to do_reiserfs_new_blocknrs + ** unless it has already successfully allocated at least one block. + ** Just in case, we translate into a return value the rest of the + ** filesystem can understand. + ** + ** It is an error to change this without making the + ** rest of the filesystem understand NO_MORE_UNUSED_CONTIGUOUS_BLOCKS + ** If you consider it a bug to return NO_DISK_SPACE here, fix the rest + ** of the fs first. + */ + if (ret == NO_MORE_UNUSED_CONTIGUOUS_BLOCKS) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning("reiser-2015: this shouldn't happen, may cause false out of disk space error"); +#endif + return NO_DISK_SPACE; + } + return ret; +} + +// +// a portion of this function, was derived from minix or ext2's +// analog. You should be able to tell which portion by looking at the +// ext2 code and comparing. 
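The preallocation code above hands blocks out of a small per-inode window: i_prealloc_block is the next reserved block number, i_prealloc_count the number still unused, and whatever remains in the window is eventually given back, which is what reiserfs_discard_prealloc() below does. A stripped-down sketch of that consume/discard bookkeeping; free_block() is a hypothetical stand-in for reiserfs_free_block() and the block numbers are made up.

#include <stdio.h>

struct prealloc {
	unsigned long next_block;	/* like i_prealloc_block */
	int count;			/* like i_prealloc_count */
};

/* Hypothetical stand-in for reiserfs_free_block(). */
static void free_block(unsigned long block)
{
	printf("freeing unused preallocated block %lu\n", block);
}

/* Take one block from the window, or report that it is empty so the
 * caller knows to allocate a fresh contiguous window. */
static int take_block(struct prealloc *p, unsigned long *block)
{
	if (p->count <= 0)
		return 0;
	p->count--;
	*block = p->next_block++;
	return 1;
}

/* Return whatever is left of the window to the free pool. */
static void discard(struct prealloc *p)
{
	while (p->count-- > 0)
		free_block(p->next_block++);
	p->count = 0;
}

int main(void)
{
	struct prealloc p = { 5000, 8 };
	unsigned long blk;

	take_block(&p, &blk);		/* uses block 5000, 7 left */
	take_block(&p, &blk);		/* uses block 5001, 6 left */
	discard(&p);			/* frees 5002..5007 */
	return 0;
}

Grabbing the window as one contiguous run is what lowers fragmentation for large files, as the comments above explain.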
+ +void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, + struct inode * inode) +{ + if (inode->u.reiserfs_i.i_prealloc_count > 0) { + while (inode->u.reiserfs_i.i_prealloc_count--) { + reiserfs_free_block(th,inode->u.reiserfs_i.i_prealloc_block); + inode->u.reiserfs_i.i_prealloc_block++; + } + } + inode->u.reiserfs_i.i_prealloc_count = 0; +} +#endif diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/buffer2.c linux/fs/reiserfs/buffer2.c --- v2.4.0/linux/fs/reiserfs/buffer2.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/buffer2.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,358 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + + +/* + * Contains code from + * + * linux/include/linux/lock.h and linux/fs/buffer.c /linux/fs/minix/fsync.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +/* + * wait_buffer_until_released + * reiserfs_bread + * reiserfs_getblk + * get_new_buffer + */ + + + +/* when we allocate a new block (get_new_buffer, get_empty_nodes) and + get buffer for it, it is possible that it is held by someone else + or even by this process. In this function we wait until all other + holders release buffer. To make sure, that current process does not + hold we did free all buffers in tree balance structure + (get_empty_nodes and get_nodes_for_preserving) or in path structure + only (get_new_buffer) just before calling this */ +void wait_buffer_until_released (struct buffer_head * bh) +{ + int repeat_counter = 0; + + while (atomic_read (&(bh->b_count)) > 1) { + + if ( !(++repeat_counter % 30000000) ) { + reiserfs_warning ("vs-3050: wait_buffer_until_released: nobody releases buffer (%b). Still waiting (%d) %cJDIRTY %cJWAIT\n", + bh, repeat_counter, buffer_journaled(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!'); + } + run_task_queue(&tq_disk); + current->policy |= SCHED_YIELD; + /*current->counter = 0;*/ + schedule(); + } + if (repeat_counter > 30000000) { + reiserfs_warning("vs-3051: done waiting, ignore vs-3050 messages for (%b)\n", bh) ; + } +} + +/* + * reiserfs_bread() reads a specified block and returns the buffer that contains + * it. It returns NULL if the block was unreadable. + */ +/* It first tries to find the block in cache, and if it cannot do so + then it creates a new buffer and schedules I/O to read the + block. */ +/* The function is NOT SCHEDULE-SAFE! */ + +struct buffer_head * reiserfs_bread (kdev_t n_dev, int n_block, int n_size) +{ + return bread (n_dev, n_block, n_size); +} + +/* This function looks for a buffer which contains a given block. If + the block is in cache it returns it, otherwise it returns a new + buffer which is not uptodate. This is called by reiserfs_bread and + other functions. Note that get_new_buffer ought to be called this + and this ought to be called get_new_buffer, since this doesn't + actually get the block off of the disk. */ +/* The function is NOT SCHEDULE-SAFE! */ + +struct buffer_head * reiserfs_getblk (kdev_t n_dev, int n_block, int n_size) +{ + return getblk (n_dev, n_block, n_size); +} + +#ifdef NEW_GET_NEW_BUFFER + +/* returns one buffer with a blocknr near blocknr. 
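wait_buffer_until_released() above is a plain polling wait: while someone else still holds a reference to the buffer, kick the disk task queue, yield the CPU, and try again, warning if the wait drags on implausibly long. A user-space analog of that loop; buffer_busy() is a hypothetical stand-in for the b_count > 1 test and gives up after a few polls so the sketch terminates.

#include <stdio.h>
#include <sched.h>

/* Hypothetical stand-in for "someone else still holds this buffer". */
static int buffer_busy(void)
{
	static int polls;
	return ++polls < 1000;
}

static void wait_until_released(void)
{
	unsigned long repeat = 0;

	while (buffer_busy()) {
		if (!(++repeat % 30000000))
			fprintf(stderr, "still waiting after %lu polls\n", repeat);
		sched_yield();		/* the kernel version yields and calls schedule() */
	}
}

int main(void)
{
	wait_until_released();
	printf("buffer released\n");
	return 0;
}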
*/ +static int get_new_buffer_near_blocknr( + struct super_block * p_s_sb, + int blocknr, + struct buffer_head ** pp_s_new_bh, + struct path * p_s_path + ) { + unsigned long n_new_blocknumber = 0; + int n_ret_value, + n_repeat = CARRY_ON; + +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; + + if (!blocknr) + printk ("blocknr passed to get_new_buffer_near_blocknr was 0"); +#endif + + + if ( (n_ret_value = reiserfs_new_blocknrs (p_s_sb, &n_new_blocknumber, + blocknr, 1)) == NO_DISK_SPACE ) + return NO_DISK_SPACE; + + *pp_s_new_bh = reiserfs_getblk(p_s_sb->s_dev, n_new_blocknumber, p_s_sb->s_blocksize); + if ( buffer_uptodate(*pp_s_new_bh) ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( buffer_dirty(*pp_s_new_bh) || (*pp_s_new_bh)->b_dev == NODEV ) { + reiserfs_panic(p_s_sb, "PAP-14080: get_new_buffer: invalid uptodate buffer %b for the new block", *pp_s_new_bh); + } +#endif + + /* Free path buffers to prevent deadlock. */ + /* It is possible that this process has the buffer, which this function is getting, already in + its path, and is responsible for double incrementing the value of b_count. If we recalculate + the path after schedule we can avoid risking an endless loop. This problematic situation is + possible in a multiple processing environment. Suppose process 1 has acquired a path P; then + process 2 balanced and remove block A from the tree. Process 1 continues and runs + get_new_buffer, that returns buffer with block A. If node A was on the path P, then it will + have b_count == 2. If we now will simply wait in while ( (*pp_s_new_bh)->b_count > 1 ) we get + into an endless loop, as nobody will release this buffer and the current process holds buffer + twice. That is why we do decrement_counters_in_path(p_s_path) before waiting until b_count + becomes 1. (it there were other processes holding node A, then eventually we will get a + moment, when all of them released a buffer). */ + if ( atomic_read (&((*pp_s_new_bh)->b_count)) > 1 ) { + decrement_counters_in_path(p_s_path); + n_ret_value |= SCHEDULE_OCCURRED; + } + + while ( atomic_read (&((*pp_s_new_bh)->b_count)) > 1 ) { + +#ifdef REISERFS_INFO + printk("get_new_buffer() calls schedule to decrement b_count\n"); +#endif + +#ifdef CONFIG_REISERFS_CHECK + if ( ! (++repeat_counter % 10000) ) + printk("get_new_buffer(%u): counter(%d) too big", current->pid, repeat_counter); +#endif + + current->counter = 0; + schedule(); + } + +#ifdef CONFIG_REISERFS_CHECK + if ( buffer_dirty(*pp_s_new_bh) || (*pp_s_new_bh)->b_dev == NODEV ) { + print_buffer_head(*pp_s_new_bh,"get_new_buffer"); + reiserfs_panic(p_s_sb, "PAP-14090: get_new_buffer: invalid uptodate buffer %b for the new block(case 2)", *pp_s_new_bh); + } +#endif + + } + else { + ; + +#ifdef CONFIG_REISERFS_CHECK + if (atomic_read (&((*pp_s_new_bh)->b_count)) != 1) { + reiserfs_panic(p_s_sb,"PAP-14100: get_new_buffer: not uptodate buffer %b for the new block has b_count more than one", + *pp_s_new_bh); + } +#endif + + } + return (n_ret_value | n_repeat); +} + + +/* returns the block number of the last unformatted node, assumes p_s_key_to_search.k_offset is a byte in the tail of + the file, Useful for when you want to append to a file, and convert a direct item into an unformatted node near the + last unformatted node of the file. Putting the unformatted node near the direct item is potentially very bad to do. + If there is no unformatted node in the file, then we return the block number of the direct item. */ +/* The function is NOT SCHEDULE-SAFE! 
*/ +inline int get_last_unformatted_node_blocknr_of_file( struct key * p_s_key_to_search, struct super_block * p_s_sb, + struct buffer_head * p_s_bh + struct path * p_unf_search_path, struct inode * p_s_inode) + +{ + struct key unf_key_to_search; + struct item_head * p_s_ih; + int n_pos_in_item; + struct buffer_head * p_indirect_item_bh; + + copy_key(&unf_key_to_search,p_s_key_to_search); + unf_key_to_search.k_uniqueness = TYPE_INDIRECT; + unf_key_to_search.k_offset = p_s_inode->u.reiserfs_i.i_first_direct_byte - 1; + + /* p_s_key_to_search->k_offset - MAX_ITEM_LEN(p_s_sb->s_blocksize); */ + if (search_for_position_by_key (p_s_sb, &unf_key_to_search, p_unf_search_path, &n_pos_in_item) == POSITION_FOUND) + { + p_s_ih = B_N_PITEM_HEAD(p_indirect_item_bh = PATH_PLAST_BUFFER(p_unf_search_path), PATH_LAST_POSITION(p_unf_search_path)); + return (B_I_POS_UNFM_POINTER(p_indirect_item_bh, p_s_ih, n_pos_in_item)); + } + /* else */ + printk("reiser-1800: search for unformatted node failed, p_s_key_to_search->k_offset = %u, unf_key_to_search.k_offset = %u, MAX_ITEM_LEN(p_s_sb->s_blocksize) = %ld, debug this\n", p_s_key_to_search->k_offset, unf_key_to_search.k_offset, MAX_ITEM_LEN(p_s_sb->s_blocksize) ); + print_buffer_head(PATH_PLAST_BUFFER(p_unf_search_path), "the buffer holding the item before the key we failed to find"); + print_block_head(PATH_PLAST_BUFFER(p_unf_search_path), "the block head"); + return 0; /* keeps the compiler quiet */ +} + + + /* hasn't been out of disk space tested */ +/* The function is NOT SCHEDULE-SAFE! */ +static int get_buffer_near_last_unf ( struct super_block * p_s_sb, struct key * p_s_key_to_search, + struct inode * p_s_inode, struct buffer_head * p_s_bh, + struct buffer_head ** pp_s_un_bh, struct path * p_s_search_path) +{ + int unf_blocknr = 0, /* blocknr from which we start search for a free block for an unformatted node, if 0 + then we didn't find an unformatted node though we might have found a file hole */ + n_repeat = CARRY_ON; + struct key unf_key_to_search; + struct path unf_search_path; + + copy_key(&unf_key_to_search,p_s_key_to_search); + unf_key_to_search.k_uniqueness = TYPE_INDIRECT; + + if ( + (p_s_inode->u.reiserfs_i.i_first_direct_byte > 4095) /* i_first_direct_byte gets used for all sorts of + crap other than what the name indicates, thus + testing to see if it is 0 is not enough */ + && (p_s_inode->u.reiserfs_i.i_first_direct_byte < MAX_KEY_OFFSET) /* if there is no direct item then + i_first_direct_byte = MAX_KEY_OFFSET */ + ) + { + /* actually, we don't want the last unformatted node, we want the last unformatted node + which is before the current file offset */ + unf_key_to_search.k_offset = ((p_s_inode->u.reiserfs_i.i_first_direct_byte -1) < unf_key_to_search.k_offset) ? p_s_inode->u.reiserfs_i.i_first_direct_byte -1 : unf_key_to_search.k_offset; + + while (unf_key_to_search.k_offset > -1) + { + /* This is our poorly documented way of initializing paths. -Hans */ + init_path (&unf_search_path); + /* get the blocknr from which we start the search for a free block. 
*/ + unf_blocknr = get_last_unformatted_node_blocknr_of_file( p_s_key_to_search, /* assumes this points to the file tail */ + p_s_sb, /* lets us figure out the block size */ + p_s_bh, /* if there is no unformatted node in the file, + then it returns p_s_bh->b_blocknr */ + &unf_search_path, + p_s_inode + ); +/* printk("in while loop: unf_blocknr = %d, *pp_s_un_bh = %p\n", unf_blocknr, *pp_s_un_bh); */ + if (unf_blocknr) + break; + else /* release the path and search again, this could be really slow for huge + holes.....better to spend the coding time adding compression though.... -Hans */ + { + /* Vladimir, is it a problem that I don't brelse these buffers ?-Hans */ + decrement_counters_in_path(&unf_search_path); + unf_key_to_search.k_offset -= 4096; + } + } + if (unf_blocknr) { + n_repeat |= get_new_buffer_near_blocknr(p_s_sb, unf_blocknr, pp_s_un_bh, p_s_search_path); + } + else { /* all unformatted nodes are holes */ + n_repeat |= get_new_buffer_near_blocknr(p_s_sb, p_s_bh->b_blocknr, pp_s_un_bh, p_s_search_path); + } + } + else { /* file has no unformatted nodes */ + n_repeat |= get_new_buffer_near_blocknr(p_s_sb, p_s_bh->b_blocknr, pp_s_un_bh, p_s_search_path); +/* printk("in else: unf_blocknr = %d, *pp_s_un_bh = %p\n", unf_blocknr, *pp_s_un_bh); */ +/* print_path (0, p_s_search_path); */ + } + + return n_repeat; +} + +#endif /* NEW_GET_NEW_BUFFER */ + + +#ifdef OLD_GET_NEW_BUFFER + +/* The function is NOT SCHEDULE-SAFE! */ +int get_new_buffer( + struct reiserfs_transaction_handle *th, + struct buffer_head * p_s_bh, + struct buffer_head ** pp_s_new_bh, + struct path * p_s_path + ) { + unsigned long n_new_blocknumber = 0; + int n_repeat; + struct super_block * p_s_sb = th->t_super; + + if ( (n_repeat = reiserfs_new_unf_blocknrs (th, &n_new_blocknumber, p_s_bh->b_blocknr)) == NO_DISK_SPACE ) + return NO_DISK_SPACE; + + *pp_s_new_bh = reiserfs_getblk(p_s_sb->s_dev, n_new_blocknumber, p_s_sb->s_blocksize); + if (atomic_read (&(*pp_s_new_bh)->b_count) > 1) { + /* Free path buffers to prevent deadlock which can occur in the + situation like : this process holds p_s_path; Block + (*pp_s_new_bh)->b_blocknr is on the path p_s_path, but it is + not necessary, that *pp_s_new_bh is in the tree; process 2 + could remove it from the tree and freed block + (*pp_s_new_bh)->b_blocknr. Reiserfs_new_blocknrs in above + returns block (*pp_s_new_bh)->b_blocknr. Reiserfs_getblk gets + buffer for it, and it has b_count > 1. If we now will simply + wait in while ( (*pp_s_new_bh)->b_count > 1 ) we get into an + endless loop, as nobody will release this buffer and the + current process holds buffer twice. That is why we do + decrement_counters_in_path(p_s_path) before waiting until + b_count becomes 1. (it there were other processes holding node + pp_s_new_bh, then eventually we will get a moment, when all of + them released a buffer). */ + decrement_counters_in_path(p_s_path); + wait_buffer_until_released (*pp_s_new_bh); + n_repeat |= SCHEDULE_OCCURRED; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( atomic_read (&((*pp_s_new_bh)->b_count)) != 1 || buffer_dirty (*pp_s_new_bh)) { + reiserfs_panic(p_s_sb,"PAP-14100: get_new_buffer: not free or dirty buffer %b for the new block", + *pp_s_new_bh); + } +#endif + + return n_repeat; +} + +#endif /* OLD_GET_NEW_BUFFER */ + + +#ifdef GET_MANY_BLOCKNRS + /* code not yet functional */ +get_next_blocknr ( + unsigned long * p_blocknr_array, /* we get a whole bunch of blocknrs all at once for + the write. This is better than getting them one at + a time. 
*/ + unsigned long ** p_blocknr_index, /* pointer to current offset into the array. */ + unsigned long blocknr_array_length +) +{ + unsigned long return_value; + + if (*p_blocknr_index < p_blocknr_array + blocknr_array_length) { + return_value = **p_blocknr_index; + **p_blocknr_index = 0; + *p_blocknr_index++; + return (return_value); + } + else + { + kfree (p_blocknr_array); + } +} +#endif /* GET_MANY_BLOCKNRS */ + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/dir.c linux/fs/reiserfs/dir.c --- v2.4.0/linux/fs/reiserfs/dir.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/dir.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,249 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +extern struct key MIN_KEY; + +static int reiserfs_readdir (struct file *, void *, filldir_t); +int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) ; + +struct file_operations reiserfs_dir_operations = { + read: generic_read_dir, + readdir: reiserfs_readdir, + fsync: reiserfs_dir_fsync, +}; + +/* + * directories can handle most operations... + */ +struct inode_operations reiserfs_dir_inode_operations = { + //&reiserfs_dir_operations, /* default_file_ops */ + create: reiserfs_create, + lookup: reiserfs_lookup, + link: reiserfs_link, + unlink: reiserfs_unlink, + symlink: reiserfs_symlink, + mkdir: reiserfs_mkdir, + rmdir: reiserfs_rmdir, + mknod: reiserfs_mknod, + rename: reiserfs_rename, +}; + +int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry, int datasync) { + int ret = 0 ; + int windex ; + struct reiserfs_transaction_handle th ; + + journal_begin(&th, dentry->d_inode->i_sb, 1) ; + windex = push_journal_writer("dir_fsync") ; + reiserfs_prepare_for_journal(th.t_super, SB_BUFFER_WITH_SB(th.t_super), 1) ; + journal_mark_dirty(&th, dentry->d_inode->i_sb, SB_BUFFER_WITH_SB (dentry->d_inode->i_sb)) ; + pop_journal_writer(windex) ; + journal_end_sync(&th, dentry->d_inode->i_sb, 1) ; + + return ret ; +} + + +#define store_ih(where,what) copy_item_head (where, what) + +// +static int reiserfs_readdir (struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH (path_to_entry); + struct buffer_head * bh; + int item_num, entry_num; + struct key * rkey; + struct item_head * ih, tmp_ih; + int search_res; + char * local_buf; + loff_t next_pos; + char small_buf[32] ; /* avoid kmalloc if we can */ + struct reiserfs_dir_entry de; + + + reiserfs_check_lock_depth("readdir") ; + + /* form key for search the next directory entry using f_pos field of + file structure */ + make_cpu_key (&pos_key, inode, (filp->f_pos) ? 
(filp->f_pos) : DOT_OFFSET, + TYPE_DIRENTRY, 3); + next_pos = cpu_key_k_offset (&pos_key); + + /* reiserfs_warning ("reiserfs_readdir 1: f_pos = %Ld\n", filp->f_pos);*/ + + while (1) { + research: + /* search the directory item, containing entry with specified key */ + search_res = search_by_entry_key (inode->i_sb, &pos_key, &path_to_entry, &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + return -EIO; + } + entry_num = de.de_entry_num; + bh = de.de_bh; + item_num = de.de_item_num; + ih = de.de_ih; + store_ih (&tmp_ih, ih); + +#ifdef CONFIG_REISERFS_CHECK + /* we must have found item, that is item of this directory, */ + if (COMP_SHORT_KEYS (&(ih->ih_key), &pos_key)) + reiserfs_panic (inode->i_sb, "vs-9000: reiserfs_readdir: " + "found item %h does not match to dir we readdir %k", + ih, &pos_key); + + if (item_num > B_NR_ITEMS (bh) - 1) + reiserfs_panic (inode->i_sb, "vs-9005: reiserfs_readdir: " + "item_num == %d, item amount == %d", + item_num, B_NR_ITEMS (bh)); + + /* and entry must be not more than number of entries in the item */ + if (I_ENTRY_COUNT (ih) < entry_num) + reiserfs_panic (inode->i_sb, "vs-9010: reiserfs_readdir: " + "entry number is too big %d (%d)", + entry_num, I_ENTRY_COUNT (ih)); +#endif /* CONFIG_REISERFS_CHECK */ + + if (search_res == POSITION_FOUND || entry_num < I_ENTRY_COUNT (ih)) { + /* go through all entries in the directory item beginning from the entry, that has been found */ + struct reiserfs_de_head * deh = B_I_DEH (bh, ih) + entry_num; + + for (; entry_num < I_ENTRY_COUNT (ih); entry_num ++, deh ++) { + int d_reclen; + char * d_name; + off_t d_off; + ino_t d_ino; + + if (!de_visible (deh)) + /* it is hidden entry */ + continue; + d_reclen = entry_length (bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); + if (!d_name[d_reclen - 1]) + d_reclen = strlen (d_name); + + if (d_reclen > REISERFS_MAX_NAME_LEN(inode->i_sb->s_blocksize)){ + /* too big to send back to VFS */ + continue ; + } + d_off = deh_offset (deh); + filp->f_pos = d_off ; + d_ino = deh_objectid (deh); + if (d_reclen <= 32) { + local_buf = small_buf ; + } else { + local_buf = kmalloc(d_reclen, GFP_BUFFER) ; + if (!local_buf) { + pathrelse (&path_to_entry); + return -ENOMEM ; + } + if (item_moved (&tmp_ih, &path_to_entry)) { + kfree(local_buf) ; + goto research; + } + } + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy (local_buf, d_name, d_reclen); + if (filldir (dirent, d_name, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + kfree(local_buf) ; + } + goto end; + } + if (local_buf != small_buf) { + kfree(local_buf) ; + } + + // next entry should be looked for with such offset + next_pos = deh_offset (deh) + 1; + + if (item_moved (&tmp_ih, &path_to_entry)) { + reiserfs_warning ("vs-9020: reiserfs_readdir " + "things are moving under hands. Researching..\n"); + goto research; + } + } /* for */ + } + + if (item_num != B_NR_ITEMS (bh) - 1) + // end of directory has been reached + goto end; + + /* item we went through is last item of node. Using right + delimiting key check is it directory end */ + rkey = get_rkey (&path_to_entry, inode->i_sb); + if (! comp_le_keys (rkey, &MIN_KEY)) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning ("vs-9025: reiserfs_readdir:" + "get_rkey failed. 
Researching..\n"); +#endif + /* set pos_key to key, that is the smallest and greater + that key of the last entry in the item */ + set_cpu_key_k_offset (&pos_key, next_pos); + continue; + } + + if ( COMP_SHORT_KEYS (rkey, &pos_key)) { + // end of directory has been reached + goto end; + } + + /* directory continues in the right neighboring block */ + set_cpu_key_k_offset (&pos_key, le_key_k_offset (ITEM_VERSION_1, rkey)); + + } /* while */ + + + end: + // FIXME: ext2_readdir does not reset f_pos + filp->f_pos = next_pos; + pathrelse (&path_to_entry); + reiserfs_check_path(&path_to_entry) ; + return 0; +} + + + + + + + + + + + + + + + + + + + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/do_balan.c linux/fs/reiserfs/do_balan.c --- v2.4.0/linux/fs/reiserfs/do_balan.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/do_balan.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,2043 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* Now we have all buffers that must be used in balancing of the tree */ +/* Further calculations can not cause schedule(), and thus the buffer */ +/* tree will be stable until the balancing will be finished */ +/* balance the tree according to the analysis made before, */ +/* and using buffers obtained after all above. */ + + +/** + ** balance_leaf_when_delete + ** balance_leaf + ** do_balance + ** + **/ + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +#ifdef CONFIG_REISERFS_CHECK + +struct tree_balance * cur_tb = NULL; /* detects whether more than one + copy of tb exists as a means + of checking whether schedule + is interrupting do_balance */ +#endif + + +inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, + struct buffer_head * bh, int flag) +{ + if (reiserfs_dont_log(tb->tb_sb)) { + if (!test_and_set_bit(BH_Dirty, &bh->b_state)) { + __mark_buffer_dirty(bh) ; + tb->need_balance_dirty = 1; + } + } else { + int windex = push_journal_writer("do_balance") ; + journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ; + pop_journal_writer(windex) ; + } +} + +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + + +/* summary: + if deleting something ( tb->insert_size[0] < 0 ) + return(balance_leaf_when_delete()); (flag d handled here) + else + if lnum is larger than 0 we put items into the left node + if rnum is larger than 0 we put items into the right node + if snum1 is larger than 0 we put items into the new node s1 + if snum2 is larger than 0 we put items into the new node s2 +Note that all *num* count new items being created. + +It would be easier to read balance_leaf() if each of these summary +lines was a separate procedure rather than being inlined. I think +that there are many passages here and in balance_leaf_when_delete() in +which two calls to one procedure can replace two passages, and it +might save cache space and improve software maintenance costs to do so. + +Vladimir made the perceptive comment that we should offload most of +the decision making in this function into fix_nodes/check_balance, and +then create some sort of structure in tb that says what actions should +be performed by do_balance. 
+ +-Hans */ + + + +/* Balance leaf node in case of delete or cut: insert_size[0] < 0 + * + * lnum, rnum can have values >= -1 + * -1 means that the neighbor must be joined with S + * 0 means that nothing should be done with the neighbor + * >0 means to shift entirely or partly the specified number of items to the neighbor + */ +static int balance_leaf_when_delete (struct tree_balance * tb, int flag) +{ + struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + int item_pos = PATH_LAST_POSITION (tb->tb_path); + int pos_in_item = tb->tb_path->pos_in_item; + struct buffer_info bi; + int n; + struct item_head * ih; + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->FR[0] && B_LEVEL (tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1) + reiserfs_panic (tb->tb_sb, + "vs- 12000: balance_leaf_when_delete:level: wrong FR %z\n", tb->FR[0]); + if ( tb->blknum[0] > 1 ) + reiserfs_panic (tb->tb_sb, + "PAP-12005: balance_leaf_when_delete: " + "tb->blknum == %d, can not be > 1", tb->blknum[0]); + + if ( ! tb->blknum[0] && ! PATH_H_PPARENT(tb->tb_path, 0)) + reiserfs_panic (tb->tb_sb, "PAP-12010: balance_leaf_when_delete: tree can not be empty"); +#endif + + ih = B_N_PITEM_HEAD (tbS0, item_pos); + + /* Delete or truncate the item */ + + switch (flag) { + case M_DELETE: /* delete item in S[0] */ + +#ifdef CONFIG_REISERFS_CHECK + if (le16_to_cpu (ih->ih_item_len) + IH_SIZE != -tb->insert_size [0]) + reiserfs_panic (tb->tb_sb, "vs-12013: balance_leaf_when_delete: " + "mode Delete, insert size %d, ih to be deleted %h", ih); + +#if 0 /* rigth delim key not supported */ + if ( ! item_pos && (! tb->L[0] || COMP_KEYS(B_PRIGHT_DELIM_KEY(tb->L[0]), B_N_PKEY(tbS0, 0))) ) { + print_cur_tb ("12015"); + reiserfs_panic (tb->tb_sb, "PAP-12015: balance_leaf_when_delete: L0's rkey does not match to 1st key of S0: " + "rkey in L %k, first key in S0 %k, rkey in CFL %k", + tb->L[0] ? B_PRIGHT_DELIM_KEY(tb->L[0]) : 0, + B_N_PKEY(tbS0, 0), + tb->CFL[0] ? B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]) : 0); + } +#endif + +#endif + + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_delete_items (&bi, 0, item_pos, 1, -1); + + if ( ! item_pos && tb->CFL[0] ) { + if ( B_NR_ITEMS(tbS0) ) { + replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); +#if 0 /* right delim key support */ + copy_key(B_PRIGHT_DELIM_KEY(tb->L[0]), B_N_PKEY(tbS0, 0)); + reiserfs_mark_buffer_dirty (tb->L[0], 0); +#endif + } + else { + if ( ! PATH_H_POSITION (tb->tb_path, 1) ) + replace_key(tb, tb->CFL[0],tb->lkey[0],PATH_H_PPARENT(tb->tb_path, 0),0); +#if 0 /* right delim key support */ + copy_key(B_PRIGHT_DELIM_KEY(tb->L[0]), B_PRIGHT_DELIM_KEY(tbS0)); + reiserfs_mark_buffer_dirty (tb->L[0], 0); +#endif + } + } + +#ifdef CONFIG_REISERFS_CHECK +#if 0 + if (! item_pos && (!tb->CFL[0] || !tb->L[0])) +#endif + if (! item_pos && !tb->CFL[0]) + reiserfs_panic (tb->tb_sb, "PAP-12020: balance_leaf_when_delete: tb->CFL[0]==%p, tb->L[0]==%p", tb->CFL[0], tb->L[0]); +#endif + + break; + + case M_CUT: { /* cut item in S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + if (is_direntry_le_ih (ih)) { + +#ifdef CONFIG_REISERFS_CHECK +#if 0 /* right delim key support */ + if ( ! item_pos && ! pos_in_item && (! 
tb->L[0] || COMP_KEYS(B_PRIGHT_DELIM_KEY(tb->L[0]), + B_N_PKEY(tbS0, 0))) ) + reiserfs_panic(tb->tb_sb, "PAP-12025: balance_leaf_when_delete: illegal right delimiting key"); +#endif +#endif + + /* UFS unlink semantics are such that you can only delete one directory entry at a time. */ + /* when we cut a directory tb->insert_size[0] means number of entries to be cut (always 1) */ + tb->insert_size[0] = -1; + leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); + +#ifdef CONFIG_REISERFS_CHECK + if (! item_pos && ! pos_in_item && ! tb->CFL[0]) + reiserfs_panic (tb->tb_sb, "PAP-12030: balance_leaf_when_delete: can not change delimiting key. CFL[0]=%p", tb->CFL[0]); +#endif /* CONFIG_REISERFS_CHECK */ + + if ( ! item_pos && ! pos_in_item && tb->CFL[0] ) { + replace_key(tb, tb->CFL[0],tb->lkey[0],tbS0,0); +#if 0/* right delim key support */ + copy_key(B_PRIGHT_DELIM_KEY(tb->L[0]), B_N_PKEY(tbS0, 0)); + reiserfs_mark_buffer_dirty (tb->L[0], 0); +#endif + } + } else { + leaf_cut_from_buffer (&bi, item_pos, pos_in_item, -tb->insert_size[0]); + +#ifdef CONFIG_REISERFS_CHECK + if (! ih->ih_item_len) + reiserfs_panic (tb->tb_sb, "PAP-12035: balance_leaf_when_delete: cut must leave non-zero dynamic length of item"); +#endif /* CONFIG_REISERFS_CHECK */ + } + break; + } + + default: + print_cur_tb ("12040"); + reiserfs_panic (tb->tb_sb, "PAP-12040: balance_leaf_when_delete: unexpectable mode: %s(%d)", + (flag == M_PASTE) ? "PASTE" : ((flag == M_INSERT) ? "INSERT" : "UNKNOWN"), flag); + } + + /* the rule is that no shifting occurs unless by shifting a node can be freed */ + n = B_NR_ITEMS(tbS0); + if ( tb->lnum[0] ) /* L[0] takes part in balancing */ + { + if ( tb->lnum[0] == -1 ) /* L[0] must be joined with S[0] */ + { + if ( tb->rnum[0] == -1 ) /* R[0] must be also joined with S[0] */ + { + if ( tb->FR[0] == PATH_H_PPARENT(tb->tb_path, 0) ) + { + /* all contents of all the 3 buffers will be in L[0] */ + if ( PATH_H_POSITION (tb->tb_path, 1) == 0 && 1 < B_NR_ITEMS(tb->FR[0]) ) + replace_key(tb, tb->CFL[0],tb->lkey[0],tb->FR[0],1); + + /* update right_delimiting_key field */ +#if 0 + copy_key (B_PRIGHT_DELIM_KEY (tb->L[0]), B_PRIGHT_DELIM_KEY (tb->R[0])); +#endif + leaf_move_items (LEAF_FROM_S_TO_L, tb, n, -1, 0); + leaf_move_items (LEAF_FROM_R_TO_L, tb, B_NR_ITEMS(tb->R[0]), -1, 0); + +#if 0/*preserve list*/ + preserve_invalidate(tb, tbS0, tb->L[0]); + preserve_invalidate(tb, tb->R[0], tb->L[0]); +#endif + reiserfs_invalidate_buffer (tb, tbS0); + reiserfs_invalidate_buffer (tb, tb->R[0]); + + return 0; + } + /* all contents of all the 3 buffers will be in R[0] */ + leaf_move_items (LEAF_FROM_S_TO_R, tb, n, -1, 0); + leaf_move_items (LEAF_FROM_L_TO_R, tb, B_NR_ITEMS(tb->L[0]), -1, 0); + + /* right_delimiting_key is correct in R[0] */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + +#if 0 + /* mark tb->R[0] as suspected recipient */ + preserve_invalidate(tb,tbS0, tb->R[0]); + preserve_invalidate(tb,tb->L[0], tb->R[0]); +#endif + reiserfs_invalidate_buffer (tb, tbS0); + reiserfs_invalidate_buffer (tb, tb->L[0]); + + return -1; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rnum[0] != 0 ) + reiserfs_panic (tb->tb_sb, "PAP-12045: balance_leaf_when_delete: " + "rnum must be 0 (%d)", tb->rnum[0]); +#endif /* CONFIG_REISERFS_CHECK */ + + /* all contents of L[0] and S[0] will be in L[0] */ + leaf_shift_left(tb, n, -1); + +#if 0/*preserve list*/ + preserve_invalidate(tb, tbS0, tb->L[0]); /* preserved, shifting */ +#endif + reiserfs_invalidate_buffer (tb, tbS0); + + return 0; + } + /* a 
part of contents of S[0] will be in L[0] and the rest part of S[0] will be in R[0] */ + +#ifdef CONFIG_REISERFS_CHECK + if (( tb->lnum[0] + tb->rnum[0] < n ) || ( tb->lnum[0] + tb->rnum[0] > n+1 )) + reiserfs_panic (tb->tb_sb, "PAP-12050: balance_leaf_when_delete: " + "rnum(%d) and lnum(%d) and item number in S[0] are not consistent", + tb->rnum[0], tb->lnum[0], n); + + if (( tb->lnum[0] + tb->rnum[0] == n ) && (tb->lbytes != -1 || tb->rbytes != -1)) + reiserfs_panic (tb->tb_sb, "PAP-12055: balance_leaf_when_delete: " + "bad rbytes (%d)/lbytes (%d) parameters when items are not split", + tb->rbytes, tb->lbytes); + if (( tb->lnum[0] + tb->rnum[0] == n + 1 ) && (tb->lbytes < 1 || tb->rbytes != -1)) + reiserfs_panic (tb->tb_sb, "PAP-12060: balance_leaf_when_delete: " + "bad rbytes (%d)/lbytes (%d) parameters when items are split", + tb->rbytes, tb->lbytes); +#endif + + leaf_shift_left (tb, tb->lnum[0], tb->lbytes); + leaf_shift_right(tb, tb->rnum[0], tb->rbytes); + +#if 0/*preserve list*/ + preserve_invalidate (tb, tbS0, tb->L[0]); + mark_suspected_recipient (tb->tb_sb, tb->R[0]); +#endif + reiserfs_invalidate_buffer (tb, tbS0); + + return 0; + } + + if ( tb->rnum[0] == -1 ) { + /* all contents of R[0] and S[0] will be in R[0] */ + leaf_shift_right(tb, n, -1); +#if 0/*preserve list*/ + preserve_invalidate(tb, tbS0, tb->R[0]); +#endif + reiserfs_invalidate_buffer (tb, tbS0); + return 0; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rnum[0] ) + reiserfs_panic (tb->tb_sb, "PAP-12065: balance_leaf_when_delete: " + "bad rnum parameter must be 0 (%d)", tb->rnum[0]); +#endif + + return 0; +} + + +static int balance_leaf (struct tree_balance * tb, + struct item_head * ih, /* item header of inserted item (this is on little endian) */ + const char * body, /* body of inserted item or bytes to paste */ + int flag, /* i - insert, d - delete, c - cut, p - paste + (see comment to do_balance) */ + struct item_head * insert_key, /* in our processing of one level we sometimes determine what + must be inserted into the next higher level. 
This insertion + consists of a key or two keys and their corresponding + pointers */ + struct buffer_head ** insert_ptr /* inserted node-ptrs for the next level */ + ) +{ + struct buffer_head * tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#if 0/*preserve list*/ + struct buffer_head * tbF0 = PATH_H_PPARENT (tb->tb_path, 0); + int S0_b_item_order = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); +#endif + int item_pos = PATH_LAST_POSITION (tb->tb_path); /* index into the array of item headers in S[0] + of the affected item */ + struct buffer_info bi; + struct buffer_head *S_new[2]; /* new nodes allocated to hold what could not fit into S */ + int snum[2]; /* number of items that will be placed + into S_new (includes partially shifted + items) */ + int sbytes[2]; /* if an item is partially shifted into S_new then + if it is a directory item + it is the number of entries from the item that are shifted into S_new + else + it is the number of bytes from the item that are shifted into S_new + */ + int n, i; + int ret_val; + int pos_in_item; + int zeros_num; + +#if 0 + if (tb->insert_size [0] % 4) { + reiserfs_panic (tb->tb_sb, "balance_leaf: wrong insert_size %d", + tb->insert_size [0]); + } +#endif + /* Make balance in case insert_size[0] < 0 */ + if ( tb->insert_size[0] < 0 ) + return balance_leaf_when_delete (tb, flag); + + zeros_num = 0; + if (flag == M_INSERT && body == 0) + zeros_num = le16_to_cpu (ih->ih_item_len); + + pos_in_item = tb->tb_path->pos_in_item; + /* for indirect item pos_in_item is measured in unformatted node + pointers. Recalculate to bytes */ + if (flag != M_INSERT && is_indirect_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) + pos_in_item *= UNFM_P_SIZE; + + if ( tb->lnum[0] > 0 ) { + /* Shift lnum[0] items from S[0] to the left neighbor L[0] */ + if ( item_pos < tb->lnum[0] ) { + /* new item or it part falls to L[0], shift it too */ + n = B_NR_ITEMS(tb->L[0]); + + switch (flag) { + case M_INSERT: /* insert item into L[0] */ + + if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { + /* part of new item falls into L[0] */ + int new_item_len; + int version; + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direct_le_ih (ih)) + reiserfs_panic (tb->tb_sb, "PAP-12075: balance_leaf: " + "only direct inserted item can be broken. %h", ih); +#endif + ret_val = leaf_shift_left (tb, tb->lnum[0]-1, -1); + /* when reading the if conditions preceding the subsequent preserve_shifted + lines understand that their goal is to determine if all that we are + shifting is the new data being added */ +#if 0/*preserve list*/ + if (tb->lnum[0] - 1 > 0) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + + /* Calculate item length to insert to S[0] */ + new_item_len = le16_to_cpu (ih->ih_item_len) - tb->lbytes; + /* Calculate and check item length to insert to L[0] */ + ih->ih_item_len -= new_item_len; + +#ifdef CONFIG_REISERFS_CHECK + if ( (int)(ih->ih_item_len) <= 0 ) + reiserfs_panic(tb->tb_sb, "PAP-12080: balance_leaf: " + "there is nothing to insert into L[0]: ih_item_len=%d", + (int)ih->ih_item_len); +#endif + + /* Insert new item into L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, + zeros_num > ih->ih_item_len ? 
ih->ih_item_len : zeros_num); + + version = ih_version (ih); + + /* Calculate key component, item length and body to insert into S[0] */ + set_le_key_k_offset (ih_version (ih), &(ih->ih_key), + le_key_k_offset (ih_version (ih), &(ih->ih_key)) + tb->lbytes); + ih->ih_item_len = cpu_to_le16 (new_item_len); + if ( tb->lbytes > zeros_num ) { + body += (tb->lbytes - zeros_num); + zeros_num = 0; + } + else + zeros_num -= tb->lbytes; + +#ifdef CONFIG_REISERFS_CHECK + if ( (int)(ih->ih_item_len) <= 0 ) + reiserfs_panic(tb->tb_sb, "PAP-12085: balance_leaf: " + "there is nothing to insert into S[0]: ih_item_len=%d", + (int)ih->ih_item_len); +#endif + } else { + /* new item in whole falls into L[0] */ + /* Shift lnum[0]-1 items to L[0] */ + ret_val = leaf_shift_left(tb, tb->lnum[0]-1, tb->lbytes); +#if 0/*preserve list*/ + if (tb->lnum[0] > 1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Insert new item into L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_insert_into_buf (&bi, n + item_pos - ret_val, ih, body, zeros_num); +#if 0/*preserve list*/ + if (tb->preserve_mode == PRESERVE_INDIRECT_TO_DIRECT){ + mark_suspected_recipient (tb->tb_sb, bi.bi_bh); + } +#endif + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + + case M_PASTE: /* append item in L[0] */ + + if ( item_pos == tb->lnum[0] - 1 && tb->lbytes != -1 ) { + /* we must shift the part of the appended item */ + if ( is_direntry_le_ih (B_N_PITEM_HEAD (tbS0, item_pos))) { + +#ifdef CONFIG_REISERFS_CHECK + if ( zeros_num ) + reiserfs_panic(tb->tb_sb, "PAP-12090: balance_leaf: illegal parameter in case of a directory"); +#endif + + /* directory item */ + if ( tb->lbytes > pos_in_item ) { + /* new directory entry falls into L[0] */ + struct item_head * pasted; + int l_pos_in_item = pos_in_item; + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 entries from given directory item */ + ret_val = leaf_shift_left(tb, tb->lnum[0], tb->lbytes - 1); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + if ( ret_val && ! item_pos ) { + pasted = B_N_PITEM_HEAD(tb->L[0],B_NR_ITEMS(tb->L[0])-1); + l_pos_in_item += I_ENTRY_COUNT(pasted) - (tb->lbytes-1); + } + + /* Append given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, n + item_pos - ret_val, l_pos_in_item, + tb->insert_size[0], body, zeros_num); + + /* previous string prepared space for pasting new entry, following string pastes this entry */ + + /* when we have merge directory item, pos_in_item has been changed too */ + + /* paste new directory entry. 1 is entry number */ + leaf_paste_entries (bi.bi_bh, n + item_pos - ret_val, l_pos_in_item, 1, + (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + tb->insert_size[0] = 0; + } else { + /* new directory item doesn't fall into L[0] */ + /* Shift lnum[0]-1 items in whole. 
Shift lbytes directory entries from directory item number lnum[0] */ + leaf_shift_left (tb, tb->lnum[0], tb->lbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + /* Calculate new position to append in item body */ + pos_in_item -= tb->lbytes; + } + else { + /* regular object */ + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->lbytes <= 0 ) + reiserfs_panic(tb->tb_sb, "PAP-12095: balance_leaf: " + "there is nothing to shift to L[0]. lbytes=%d", + tb->lbytes); + if ( pos_in_item != B_N_PITEM_HEAD(tbS0, item_pos)->ih_item_len ) + reiserfs_panic(tb->tb_sb, "PAP-12100: balance_leaf: " + "incorrect position to paste: item_len=%d, pos_in_item=%d", + B_N_PITEM_HEAD(tbS0,item_pos)->ih_item_len, pos_in_item); +#endif + + if ( tb->lbytes >= pos_in_item ) { + /* appended item will be in L[0] in whole */ + int l_n; + + /* this bytes number must be appended to the last item of L[h] */ + l_n = tb->lbytes - pos_in_item; + + /* Calculate new insert_size[0] */ + tb->insert_size[0] -= l_n; + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->insert_size[0] <= 0 ) + reiserfs_panic(tb->tb_sb, "PAP-12105: balance_leaf: " + "there is nothing to paste into L[0]. insert_size=%d", + tb->insert_size[0]); +#endif + + ret_val = leaf_shift_left(tb,tb->lnum[0], + B_N_PITEM_HEAD(tbS0,item_pos)->ih_item_len); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer( + &bi,n + item_pos - ret_val, + B_N_PITEM_HEAD(tb->L[0],n+item_pos-ret_val)->ih_item_len, + l_n,body, zeros_num > l_n ? 
l_n : zeros_num + ); + +#ifdef CONFIG_REISERFS_CHECK + if (l_n && is_indirect_le_ih(B_N_PITEM_HEAD(tb->L[0], + n + item_pos - ret_val))) + reiserfs_panic(tb->tb_sb, "PAP-12110: balance_leaf: " + "pasting more than 1 unformatted node pointer into indirect item"); +#endif + + /* 0-th item in S0 can be only of DIRECT type when l_n != 0*/ + { + int version; + + version = le16_to_cpu (B_N_PITEM_HEAD (tbS0, 0)->ih_version); + set_le_key_k_offset (version, B_N_PKEY (tbS0, 0), + le_key_k_offset (version, B_N_PKEY (tbS0, 0)) + l_n); + set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), + le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) + l_n); + } +#if 0 + set_le_key_k_offset (B_PRIGHT_DELIM_KEY(tb->L[0]), le_key_k_offset (B_PRIGHT_DELIM_KEY(tb->L[0])) + l_n); +#endif + /* k_offset (B_N_PKEY (tbS0, 0)) += l_n; + k_offset (B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0])) += l_n; + k_offset (B_PRIGHT_DELIM_KEY(tb->L[0])) += l_n;*/ + +#ifdef NO_CONFIG_REISERFS_CHECK /* journal victim */ + if (!buffer_dirty (tbS0) || !buffer_dirty (tb->CFL[0]) || !buffer_dirty (tb->L[0])) + reiserfs_panic(tb->tb_sb, "PAP-12115: balance_leaf: L, CLF and S must be dirty already"); +#endif + + /* Calculate new body, position in item and insert_size[0] */ + if ( l_n > zeros_num ) { + body += (l_n - zeros_num); + zeros_num = 0; + } + else + zeros_num -= l_n; + pos_in_item = 0; + +#ifdef CONFIG_REISERFS_CHECK + if (comp_short_le_keys (B_N_PKEY(tbS0,0), + B_N_PKEY(tb->L[0],B_NR_ITEMS(tb->L[0])-1)) || + !op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) || + !op_is_left_mergeable(B_N_PDELIM_KEY(tb->CFL[0],tb->lkey[0]), tbS0->b_size)) + reiserfs_panic (tb->tb_sb, "PAP-12120: balance_leaf: " + "item must be merge-able with left neighboring item"); +#endif + + } + else /* only part of the appended item will be in L[0] */ + { + /* Calculate position in item for append in S[0] */ + pos_in_item -= tb->lbytes; + +#ifdef CONFIG_REISERFS_CHECK + if ( pos_in_item <= 0 ) + reiserfs_panic(tb->tb_sb, "PAP-12125: balance_leaf: " + "no place for paste. pos_in_item=%d", pos_in_item); +#endif + + /* Shift lnum[0] - 1 items in whole. Shift lbytes - 1 byte from item number lnum[0] */ + leaf_shift_left(tb,tb->lnum[0],tb->lbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + } + } + else /* appended item will be in L[0] in whole */ + { + struct item_head * pasted; + +#ifdef REISERFS_FSCK + if ( ! item_pos && is_left_mergeable (tb->tb_sb, tb->tb_path) == 1 ) +#else + if ( ! item_pos && op_is_left_mergeable (B_N_PKEY (tbS0, 0), tbS0->b_size) ) +#endif + { /* if we paste into first item of S[0] and it is left mergable */ + /* then increment pos_in_item by the size of the last item in L[0] */ + pasted = B_N_PITEM_HEAD(tb->L[0],n-1); + if ( is_direntry_le_ih (pasted) ) + pos_in_item += le16_to_cpu (pasted->u.ih_entry_count); + else + pos_in_item += le16_to_cpu (pasted->ih_item_len); + } + + /* Shift lnum[0] - 1 items in whole. 
Shift lbytes - 1 byte from item number lnum[0] */ + ret_val = leaf_shift_left(tb,tb->lnum[0],tb->lbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + /* Append to body of item in L[0] */ + bi.tb = tb; + bi.bi_bh = tb->L[0]; + bi.bi_parent = tb->FL[0]; + bi.bi_position = get_left_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, n + item_pos - ret_val, pos_in_item, tb->insert_size[0], + body, zeros_num); + + /* if appended item is directory, paste entry */ + pasted = B_N_PITEM_HEAD (tb->L[0], n + item_pos - ret_val); + if (is_direntry_le_ih (pasted)) + leaf_paste_entries ( + bi.bi_bh, n + item_pos - ret_val, pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + /* if appended item is indirect item, put unformatted node into un list */ + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, ((struct unfm_nodeinfo*)body)->unfm_freespace); + tb->insert_size[0] = 0; + zeros_num = 0; + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12130: balance_leaf: lnum > 0: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); + } + } else { + /* new item doesn't fall into L[0] */ + leaf_shift_left(tb,tb->lnum[0],tb->lbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->L[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + } /* tb->lnum[0] > 0 */ + + /* Calculate new item position */ + item_pos -= ( tb->lnum[0] - (( tb->lbytes != -1 ) ? 1 : 0)); + + if ( tb->rnum[0] > 0 ) { + /* shift rnum[0] items from S[0] to the right neighbor R[0] */ + n = B_NR_ITEMS(tbS0); + switch ( flag ) { + + case M_INSERT: /* insert item */ + if ( n - tb->rnum[0] < item_pos ) + { /* new item or its part falls to R[0] */ + if ( item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1 ) + { /* part of new item falls into R[0] */ + int old_key_comp, old_len, r_zeros_number; + const char * r_body; + int version; + loff_t offset; + +#ifdef CONFIG_REISERFS_CHECK + if ( !is_direct_le_ih (ih) ) + reiserfs_panic(tb->tb_sb, "PAP-12135: balance_leaf: " + "only direct item can be split. 
(%h)", ih); +#endif + + leaf_shift_right(tb,tb->rnum[0]-1,-1); +#if 0/*preserve list*/ + if (tb->rnum[0]>1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + + version = le16_to_cpu (ih->ih_version); + /* Remember key component and item length */ + old_key_comp = le_key_k_offset (version, &(ih->ih_key)); + old_len = le16_to_cpu (ih->ih_item_len); + + /* Calculate key component and item length to insert into R[0] */ + offset = le_key_k_offset (version, &(ih->ih_key)) + (old_len - tb->rbytes); + set_le_key_k_offset (version, &(ih->ih_key), offset); + ih->ih_item_len = cpu_to_le16 (tb->rbytes); + /* Insert part of the item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + if ( offset - old_key_comp > zeros_num ) { + r_zeros_number = 0; + r_body = body + offset - old_key_comp - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - (offset - old_key_comp); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); + + /* Replace right delimiting key by first key in R[0] */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + + /* Calculate key component and item length to insert into S[0] */ + set_le_key_k_offset (version, &(ih->ih_key), old_key_comp); + ih->ih_item_len = cpu_to_le16 (old_len - tb->rbytes); + + tb->insert_size[0] -= tb->rbytes; + + } + else /* whole new item falls into R[0] */ + { + /* Shift rnum[0]-1 items to R[0] */ + ret_val = leaf_shift_right(tb,tb->rnum[0]-1,tb->rbytes); +#if 0/*preserve list*/ + if (tb->rnum[0]>1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Insert new item into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_insert_into_buf (&bi, item_pos - n + tb->rnum[0] - 1, ih, body, zeros_num); +#if 0/*preserve list*/ + if (tb->preserve_mode == PRESERVE_INDIRECT_TO_DIRECT){ + mark_suspected_recipient (tb->tb_sb, bi.bi_bh); + } +#endif + + /* If we insert new item in the begin of R[0] change the right delimiting key */ + if ( item_pos - n + tb->rnum[0] - 1 == 0 ) { + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); + +#if 0 + /* update right delimiting key */ + copy_key(B_PRIGHT_DELIM_KEY(tbS0), &(ih->ih_key)); + reiserfs_mark_buffer_dirty (tbS0, 0); +#endif + } + zeros_num = tb->insert_size[0] = 0; + } + } + else /* new item or part of it doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + break; + + case M_PASTE: /* append item */ + + if ( n - tb->rnum[0] <= item_pos ) /* pasted item or part of it falls to R[0] */ + { + if ( item_pos == n - tb->rnum[0] && tb->rbytes != -1 ) + { /* we must shift the part of the appended item */ + if ( is_direntry_le_ih (B_N_PITEM_HEAD(tbS0, item_pos))) + { /* we append to directory item */ + int entry_count; + +#ifdef CONFIG_REISERFS_CHECK + if ( zeros_num ) + reiserfs_panic(tb->tb_sb, "PAP-12145: balance_leaf: illegal parametr in case of a directory"); +#endif + + entry_count = I_ENTRY_COUNT(B_N_PITEM_HEAD(tbS0, item_pos)); + if ( entry_count - tb->rbytes < pos_in_item ) + /* new directory entry 
falls into R[0] */ + { + int paste_entry_position; + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rbytes - 1 >= entry_count || ! tb->insert_size[0] ) + reiserfs_panic(tb->tb_sb, "PAP-12150: balance_leaf: " + "no enough of entries to shift to R[0]: rbytes=%d, entry_count=%d", + tb->rbytes, entry_count); +#endif + + /* Shift rnum[0]-1 items in whole. Shift rbytes-1 directory entries from directory item number rnum[0] */ + leaf_shift_right(tb,tb->rnum[0],tb->rbytes - 1); +#if 0/*preserve list*/ + /* if we are shifting more than just the new entry */ + if (tb->rbytes > 1 || tb->rnum[0] > 1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Paste given directory entry to directory item */ + paste_entry_position = pos_in_item - entry_count + tb->rbytes - 1; + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_paste_in_buffer (&bi, 0, paste_entry_position, + tb->insert_size[0],body,zeros_num); + /* paste entry */ + leaf_paste_entries ( + bi.bi_bh, 0, paste_entry_position, 1, (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + + if ( paste_entry_position == 0 ) { + /* change delimiting keys */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); +#if 0 + copy_key(B_PRIGHT_DELIM_KEY(tbS0), B_N_PKEY(tb->R[0], 0)); + reiserfs_mark_buffer_dirty (tbS0, 0); +#endif + } + + tb->insert_size[0] = 0; + pos_in_item++; + } + else /* new directory entry doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + } + else /* regular object */ + { + int n_shift, n_rem, r_zeros_number; + const char * r_body; + + /* Calculate number of bytes which must be shifted from appended item */ + if ( (n_shift = tb->rbytes - tb->insert_size[0]) < 0 ) + n_shift = 0; + +#ifdef CONFIG_REISERFS_CHECK + if (pos_in_item != B_N_PITEM_HEAD (tbS0, item_pos)->ih_item_len) + reiserfs_panic(tb->tb_sb,"PAP-12155: balance_leaf: invalid position to paste. 
ih_item_len=%d, pos_in_item=%d", + pos_in_item, B_N_PITEM_HEAD(tbS0,item_pos)->ih_item_len); +#endif + + leaf_shift_right(tb,tb->rnum[0],n_shift); +#if 0/*preserve list*/ + /* if we are shifting an old part from the appended item or more than the appended item is going into R */ + if (n_shift || tb->rnum[0] > 1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Calculate number of bytes which must remain in body after appending to R[0] */ + if ( (n_rem = tb->insert_size[0] - tb->rbytes) < 0 ) + n_rem = 0; + + { + int version; + + version = ih_version (B_N_PITEM_HEAD (tb->R[0],0)); + set_le_key_k_offset (version, B_N_PKEY(tb->R[0],0), + le_key_k_offset (version, B_N_PKEY(tb->R[0],0)) + n_rem); + set_le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0]), + le_key_k_offset (version, B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) + n_rem); + } +/* k_offset (B_N_PKEY(tb->R[0],0)) += n_rem; + k_offset (B_N_PDELIM_KEY(tb->CFR[0],tb->rkey[0])) += n_rem;*/ + do_balance_mark_internal_dirty (tb, tb->CFR[0], 0); + +#if 0 + set_le_key_k_offset (B_PRIGHT_DELIM_KEY(tbS0), le_key_k_offset (B_PRIGHT_DELIM_KEY(tbS0)) + n_rem); +/* k_offset (B_PRIGHT_DELIM_KEY(tbS0)) += n_rem;*/ + reiserfs_mark_buffer_dirty (tbS0, 0); +#endif + /* Append part of body into R[0] */ + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + if ( n_rem > zeros_num ) { + r_zeros_number = 0; + r_body = body + n_rem - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0] - n_rem, r_body, r_zeros_number); + + if (is_indirect_le_ih (B_N_PITEM_HEAD(tb->R[0],0))) { + +#ifdef CONFIG_REISERFS_CHECK + if (n_rem) + reiserfs_panic(tb->tb_sb, "PAP-12160: balance_leaf: paste more than one unformatted node pointer"); +#endif + + set_ih_free_space (B_N_PITEM_HEAD(tb->R[0],0), ((struct unfm_nodeinfo*)body)->unfm_freespace); + } + + tb->insert_size[0] = n_rem; + if ( ! n_rem ) + pos_in_item ++; + } + } + else /* pasted item in whole falls into R[0] */ + { + struct item_head * pasted; + + ret_val = leaf_shift_right(tb,tb->rnum[0],tb->rbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + /* append item in R[0] */ + if ( pos_in_item >= 0 ) { + bi.tb = tb; + bi.bi_bh = tb->R[0]; + bi.bi_parent = tb->FR[0]; + bi.bi_position = get_right_neighbor_position (tb, 0); + leaf_paste_in_buffer(&bi,item_pos - n + tb->rnum[0], pos_in_item, + tb->insert_size[0],body, zeros_num); + } + + /* paste new entry, if item is directory item */ + pasted = B_N_PITEM_HEAD(tb->R[0], item_pos - n + tb->rnum[0]); + if (is_direntry_le_ih (pasted) && pos_in_item >= 0 ) { + leaf_paste_entries ( + bi.bi_bh, item_pos - n + tb->rnum[0], pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + if ( ! 
pos_in_item ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( item_pos - n + tb->rnum[0] ) + reiserfs_panic (tb->tb_sb, "PAP-12165: balance_leaf: " + "directory item must be first item of node when pasting is in 0th position"); +#endif + + /* update delimiting keys */ + replace_key(tb, tb->CFR[0],tb->rkey[0],tb->R[0],0); +#if 0 + copy_key(B_PRIGHT_DELIM_KEY(tbS0),B_N_PKEY(tb->R[0], 0)); + reiserfs_mark_buffer_dirty (tbS0, 0); +#endif + } + } + + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, ((struct unfm_nodeinfo*)body)->unfm_freespace); + zeros_num = tb->insert_size[0] = 0; + } + } + else /* new item doesn't fall into R[0] */ + { + leaf_shift_right(tb,tb->rnum[0],tb->rbytes); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, tb->R[0]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12175: balance_leaf: rnum > 0: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? "CUT" : "UNKNOWN"), flag); + } + + } /* tb->rnum[0] > 0 */ + + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->blknum[0] > 3 ) + reiserfs_panic (tb->tb_sb, "PAP-12180: balance_leaf: blknum can not be %d. It must be <= 3", tb->blknum[0]); + + if ( tb->blknum[0] < 0 ) + reiserfs_panic (tb->tb_sb, "PAP-12185: balance_leaf: blknum can not be %d. It must be >= 0", tb->blknum[0]); +#endif + + /* if while adding to a node we discover that it is possible to split + it in two, and merge the left part into the left neighbor and the + right part into the right neighbor, eliminating the node */ + if ( tb->blknum[0] == 0 ) { /* node S[0] is empty now */ + +#ifdef CONFIG_REISERFS_CHECK + if ( ! tb->lnum[0] || ! tb->rnum[0] ) + reiserfs_panic(tb->tb_sb, "PAP-12190: balance_leaf: lnum and rnum must not be zero"); +#if 0 + if (COMP_KEYS (B_N_PKEY(tb->R[0], 0), B_PRIGHT_DELIM_KEY(tbS0))) + reiserfs_panic (tb->tb_sb, "vs-12192: balance_leaf: S[0] is being removed from the tree, it has incorrect right delimiting key"); +#endif +#endif + +#if 0 + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->L[0]) { + copy_key(B_PRIGHT_DELIM_KEY(tb->L[0]), B_PRIGHT_DELIM_KEY(tbS0)); + reiserfs_mark_buffer_dirty (tb->L[0], 0); + } + + if (tb->CFL[0]) { + copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_PRIGHT_DELIM_KEY(tbS0)); + reiserfs_mark_buffer_dirty (tb->CFL[0], 0); + } +#endif + + /* if insertion was done before 0-th position in R[0], right + delimiting key of the tb->L[0]'s and left delimiting key are + not set correctly */ + if (tb->CFL[0]) { + if (!tb->CFR[0]) + reiserfs_panic (tb->tb_sb, "vs-12195: balance_leaf: CFR not initialized"); + copy_key (B_N_PDELIM_KEY (tb->CFL[0], tb->lkey[0]), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0])); + do_balance_mark_internal_dirty (tb, tb->CFL[0], 0); + } + + reiserfs_invalidate_buffer(tb,tbS0); + return 0; + } + + + /* Fill new nodes that appear in place of S[0] */ + + /* I am told that this copying is because we need an array to enable + the looping code. -Hans */ + snum[0] = tb->s1num, + snum[1] = tb->s2num; + sbytes[0] = tb->s1bytes; + sbytes[1] = tb->s2bytes; + for( i = tb->blknum[0] - 2; i >= 0; i-- ) { + +#ifdef CONFIG_REISERFS_CHECK + if (!snum[i]) + reiserfs_panic(tb->tb_sb,"PAP-12200: balance_leaf: snum[%d] == %d. 
Must be > 0", i, snum[i]); +#endif /* CONFIG_REISERFS_CHECK */ + + /* here we shift from S to S_new nodes */ + + S_new[i] = get_FEB(tb); + + /* initialized block type and tree level */ + B_BLK_HEAD(S_new[i])->blk_level = cpu_to_le16 (DISK_LEAF_NODE_LEVEL); + + + n = B_NR_ITEMS(tbS0); + + switch (flag) { + case M_INSERT: /* insert item */ + + if ( n - snum[i] < item_pos ) + { /* new item or it's part falls to first new node S_new[i]*/ + if ( item_pos == n - snum[i] + 1 && sbytes[i] != -1 ) + { /* part of new item falls into S_new[i] */ + int old_key_comp, old_len, r_zeros_number; + const char * r_body; + int version; + +#ifdef CONFIG_REISERFS_CHECK + if ( !is_direct_le_ih(ih) ) + /* The items which can be inserted are: + Stat_data item, direct item, indirect item and directory item which consist of only two entries "." and "..". + These items must not be broken except for a direct one. */ + reiserfs_panic(tb->tb_sb, "PAP-12205: balance_leaf: " + "non-direct item can not be broken when inserting"); +#endif + + /* Move snum[i]-1 items from S[0] to S_new[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, -1, S_new[i]); +#if 0/*preserve list*/ + if (snum[i] > 1 ) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, S_new[i]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Remember key component and item length */ + version = ih_version (ih); + old_key_comp = le_key_k_offset (version, &(ih->ih_key)); + old_len = le16_to_cpu (ih->ih_item_len); + + /* Calculate key component and item length to insert into S_new[i] */ + set_le_key_k_offset (version, &(ih->ih_key), + le_key_k_offset (version, &(ih->ih_key)) + (old_len - sbytes[i])); + + ih->ih_item_len = cpu_to_le16 (sbytes[i]); + + /* Insert part of the item into S_new[i] before 0-th item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + + if ( le_key_k_offset (version, &(ih->ih_key)) - old_key_comp > zeros_num ) { + r_zeros_number = 0; + r_body = body + (le_key_k_offset (version, &(ih->ih_key)) - old_key_comp) - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - (le_key_k_offset (version, &(ih->ih_key)) - old_key_comp); + zeros_num -= r_zeros_number; + } + + leaf_insert_into_buf (&bi, 0, ih, r_body, r_zeros_number); + + /* Calculate key component and item length to insert into S[i] */ + set_le_key_k_offset (version, &(ih->ih_key), old_key_comp); + ih->ih_item_len = cpu_to_le16 (old_len - sbytes[i]); + tb->insert_size[0] -= sbytes[i]; + } + else /* whole new item falls into S_new[i] */ + { + /* Shift snum[0] - 1 items to S_new[i] (sbytes[i] of split item) */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i] - 1, sbytes[i], S_new[i]); + + /* Insert new item into S_new[i] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + leaf_insert_into_buf (&bi, item_pos - n + snum[i] - 1, ih, body, zeros_num); +#if 0/*preserve list*/ + if (tb->preserve_mode == PRESERVE_INDIRECT_TO_DIRECT){ + mark_suspected_recipient (tb->tb_sb, bi.bi_bh); + } +#endif + + zeros_num = tb->insert_size[0] = 0; + } + } + + else /* new item or it part don't falls into S_new[i] */ + { + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, S_new[i]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + break; + + case M_PASTE: /* append item */ + + if ( n - snum[i] <= item_pos ) /* pasted item or part if it falls to 
S_new[i] */ + { + if ( item_pos == n - snum[i] && sbytes[i] != -1 ) + { /* we must shift part of the appended item */ + struct item_head * aux_ih; + +#ifdef CONFIG_REISERFS_CHECK + if ( ih ) + reiserfs_panic (tb->tb_sb, "PAP-12210: balance_leaf: ih must be 0"); +#endif /* CONFIG_REISERFS_CHECK */ + + if ( is_direntry_le_ih (aux_ih = B_N_PITEM_HEAD(tbS0,item_pos))) { + /* we append to directory item */ + + int entry_count; + + entry_count = le16_to_cpu (aux_ih->u.ih_entry_count); + + if ( entry_count - sbytes[i] < pos_in_item && pos_in_item <= entry_count ) { + /* new directory entry falls into S_new[i] */ + +#ifdef CONFIG_REISERFS_CHECK + if ( ! tb->insert_size[0] ) + reiserfs_panic (tb->tb_sb, "PAP-12215: balance_leaif: insert_size is already 0"); + if ( sbytes[i] - 1 >= entry_count ) + reiserfs_panic (tb->tb_sb, "PAP-12220: balance_leaf: " + "there are no so much entries (%d), only %d", + sbytes[i] - 1, entry_count); +#endif + + /* Shift snum[i]-1 items in whole. Shift sbytes[i] directory entries from directory item number snum[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i]-1, S_new[i]); +#if 0/*preserve list*/ + /* if more than the affected item is shifted, or if more than + one entry (from the affected item) is shifted */ + if (snum[i] > 1 || sbytes[i] > 1) { + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, S_new[i]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); + } +#endif + /* Paste given directory entry to directory item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + leaf_paste_in_buffer (&bi, 0, pos_in_item - entry_count + sbytes[i] - 1, + tb->insert_size[0], body,zeros_num); + /* paste new directory entry */ + leaf_paste_entries ( + bi.bi_bh, 0, pos_in_item - entry_count + sbytes[i] - 1, + 1, (struct reiserfs_de_head *)body, body + DEH_SIZE, + tb->insert_size[0] + ); + tb->insert_size[0] = 0; + pos_in_item++; + } else { /* new directory entry doesn't fall into S_new[i] */ + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); + } + } + else /* regular object */ + { + int n_shift, n_rem, r_zeros_number; + const char * r_body; + +#ifdef CONFIG_REISERFS_CHECK + if ( pos_in_item != B_N_PITEM_HEAD(tbS0,item_pos)->ih_item_len || + tb->insert_size[0] <= 0 ) + reiserfs_panic (tb->tb_sb, "PAP-12225: balance_leaf: item too short or insert_size <= 0"); +#endif + + /* Calculate number of bytes which must be shifted from appended item */ + n_shift = sbytes[i] - tb->insert_size[0]; + if ( n_shift < 0 ) + n_shift = 0; + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], n_shift, S_new[i]); + + /* Calculate number of bytes which must remain in body after append to S_new[i] */ + n_rem = tb->insert_size[0] - sbytes[i]; + if ( n_rem < 0 ) + n_rem = 0; + /* Append part of body into S_new[0] */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + + if ( n_rem > zeros_num ) { + r_zeros_number = 0; + r_body = body + n_rem - zeros_num; + } + else { + r_body = body; + r_zeros_number = zeros_num - n_rem; + zeros_num -= r_zeros_number; + } + + leaf_paste_in_buffer(&bi, 0, n_shift, tb->insert_size[0]-n_rem, r_body,r_zeros_number); + { + struct item_head * tmp; + + tmp = B_N_PITEM_HEAD(S_new[i],0); + if (is_indirect_le_ih (tmp)) { + if (n_rem) + reiserfs_panic (tb->tb_sb, "PAP-12230: balance_leaf: invalid action with indirect item"); + set_ih_free_space (tmp, ((struct unfm_nodeinfo*)body)->unfm_freespace); + } + set_le_key_k_offset (ih_version (tmp), &tmp->ih_key, + 
le_key_k_offset (ih_version (tmp), &tmp->ih_key) + n_rem); + } + + tb->insert_size[0] = n_rem; + if ( ! n_rem ) + pos_in_item++; + } + } + else + /* item falls wholly into S_new[i] */ + { + int ret_val; + struct item_head * pasted; + +#ifdef CONFIG_REISERFS_CHECK + struct item_head * ih = B_N_PITEM_HEAD(tbS0,item_pos); + + if ( ! is_direntry_le_ih(ih) && (pos_in_item != ih->ih_item_len || + tb->insert_size[0] <= 0) ) + reiserfs_panic (tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); +#endif /* CONFIG_REISERFS_CHECK */ + + ret_val = leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); +#if 0/*preserve list*/ + /* we must preserve that which we are pasting onto the end of and shifting */ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, S_new[i]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + +#ifdef CONFIG_REISERFS_CHECK + if ( ret_val ) + reiserfs_panic (tb->tb_sb, "PAP-12240: balance_leaf: " + "unexpected value returned by leaf_move_items (%d)", + ret_val); +#endif /* CONFIG_REISERFS_CHECK */ + + /* paste into item */ + bi.tb = tb; + bi.bi_bh = S_new[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + leaf_paste_in_buffer(&bi, item_pos - n + snum[i], pos_in_item, tb->insert_size[0], body, zeros_num); + + pasted = B_N_PITEM_HEAD(S_new[i], item_pos - n + snum[i]); + if (is_direntry_le_ih (pasted)) + { + leaf_paste_entries ( + bi.bi_bh, item_pos - n + snum[i], pos_in_item, 1, + (struct reiserfs_de_head *)body, body + DEH_SIZE, tb->insert_size[0] + ); + } + + /* if we paste to indirect item update ih_free_space */ + if (is_indirect_le_ih (pasted)) + set_ih_free_space (pasted, ((struct unfm_nodeinfo*)body)->unfm_freespace); + zeros_num = tb->insert_size[0] = 0; + } + } + + else /* pasted item doesn't fall into S_new[i] */ + { + leaf_move_items (LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); +#if 0/*preserve list*/ + preserve_shifted(tb, &(PATH_PLAST_BUFFER (tb->tb_path)), tbF0, S0_b_item_order, S_new[i]); + tbS0 = PATH_PLAST_BUFFER (tb->tb_path); +#endif + } + break; + default: /* cases d and t */ + reiserfs_panic (tb->tb_sb, "PAP-12245: balance_leaf: blknum > 2: unexpectable mode: %s(%d)", + (flag == M_DELETE) ? "DELETE" : ((flag == M_CUT) ? 
"CUT" : "UNKNOWN"), flag); + } + + memcpy (insert_key + i,B_N_PKEY(S_new[i],0),KEY_SIZE); + insert_ptr[i] = S_new[i]; + +#ifdef CONFIG_REISERFS_CHECK + if (atomic_read (&(S_new[i]->b_count)) != 1) { + if (atomic_read(&(S_new[i]->b_count)) != 2 || + !(buffer_journaled(S_new[i]) || buffer_journal_dirty(S_new[i]))) { + reiserfs_panic (tb->tb_sb, "PAP-12247: balance_leaf: S_new[%d] : (%b)\n", i, S_new[i]); + } + } +#endif + +#if 0 + /* update right_delimiting_key fields */ + copy_key (B_PRIGHT_DELIM_KEY (S_new[i]), B_PRIGHT_DELIM_KEY (tbS0)); + copy_key (B_PRIGHT_DELIM_KEY (tbS0), B_N_PKEY (S_new[i], 0)); + reiserfs_mark_buffer_dirty (tbS0, 0); +#endif + + } + + /* if the affected item was not wholly shifted then we perform all necessary operations on that part or whole of the + affected item which remains in S */ + if ( 0 <= item_pos && item_pos < tb->s0num ) + { /* if we must insert or append into buffer S[0] */ + + switch (flag) + { + case M_INSERT: /* insert item into S[0] */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_insert_into_buf (&bi, item_pos, ih, body, zeros_num); +#if 0/*preserve list*/ + if (tb->preserve_mode == PRESERVE_INDIRECT_TO_DIRECT){ + mark_suspected_recipient (tb->tb_sb, bi.bi_bh); + } +#endif + + /* If we insert the first key change the delimiting key */ + if( item_pos == 0 ) { + if (tb->CFL[0]) /* can be 0 in reiserfsck */ + replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); + +#if 0 /* right delim key support */ +#ifdef CONFIG_REISERFS_CHECK + if ( ! tb->CFL[0] || ! tb->L[0] || (B_NR_ITEMS (tbS0) > 1 && + COMP_KEYS(B_PRIGHT_DELIM_KEY(tb->L[0]), B_N_PKEY(tbS0, 1))) ) + reiserfs_panic(tb->tb_sb, "PAP-12250: balance_leaf: invalid right delimiting key"); + if (!buffer_dirty (tb->L[0]) && !(buffer_journaled(tb->L[0]) || + buffer_journal_dirty(tb->L[0]))) + reiserfs_panic (tb->tb_sb, "PAP-12255: balance_leaf: tb->L[0] must be dirty"); +#endif + if (tb->L[0]) /* can be 0 in reiserfsck */ + copy_key (B_PRIGHT_DELIM_KEY (tb->L[0]), &(ih->ih_key)); +#endif /* right delim key support */ + } + break; + + case M_PASTE: { /* append item in S[0] */ + struct item_head * pasted; + + pasted = B_N_PITEM_HEAD (tbS0, item_pos); + /* when directory, may be new entry already pasted */ + if (is_direntry_le_ih (pasted)) { + if ( pos_in_item >= 0 && pos_in_item <= le16_to_cpu (pasted->u.ih_entry_count) ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! tb->insert_size[0] ) + reiserfs_panic (tb->tb_sb, "PAP-12260: balance_leaf: insert_size is 0 already"); +#endif /* CONFIG_REISERFS_CHECK */ + + /* prepare space */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_paste_in_buffer(&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); + + +#ifdef CONFIG_REISERFS_CHECK +#if 0 + if ( ! item_pos && ! pos_in_item && (! tb->L[0] || COMP_KEYS(B_PRIGHT_DELIM_KEY(tb->L[0]), + B_N_PKEY(tbS0, 0))) ) + reiserfs_panic(tb->tb_sb, "PAP-12265: balance_leaf: invalid right delimiting key"); +#endif +#endif + + /* paste entry */ + leaf_paste_entries ( + bi.bi_bh, item_pos, pos_in_item, 1, (struct reiserfs_de_head *)body, + body + DEH_SIZE, tb->insert_size[0] + ); + if ( ! item_pos && ! 
pos_in_item ) { + +#ifdef CONFIG_REISERFS_CHECK + if (!tb->CFL[0] || !tb->L[0]) + reiserfs_panic (tb->tb_sb, "PAP-12270: balance_leaf: CFL[0]/L[0] must be specified"); +#endif /* CONFIG_REISERFS_CHECK */ + + if (tb->CFL[0]) { + replace_key(tb, tb->CFL[0], tb->lkey[0],tbS0,0); + +#if 0 + /* update right delimiting key */ + copy_key (B_PRIGHT_DELIM_KEY (tb->L[0]), B_N_PKEY(tbS0, 0)); + /* probably not needed as something has been shifted to tb->L[0] already */ + reiserfs_mark_buffer_dirty (tb->L[0], 0); +#endif + } + } + tb->insert_size[0] = 0; + } + } else { /* regular object */ + if ( pos_in_item == pasted->ih_item_len ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->insert_size[0] <= 0 ) + reiserfs_panic (tb->tb_sb, + "PAP-12275: balance_leaf: insert size must not be %d", tb->insert_size[0]); +#endif /* CONFIG_REISERFS_CHECK */ + bi.tb = tb; + bi.bi_bh = tbS0; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + bi.bi_position = PATH_H_POSITION (tb->tb_path, 1); + leaf_paste_in_buffer (&bi, item_pos, pos_in_item, tb->insert_size[0], body, zeros_num); + + if (is_indirect_le_ih (pasted)) { + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->insert_size[0] != UNFM_P_SIZE ) + reiserfs_panic (tb->tb_sb, + "PAP-12280: balance_leaf: insert_size for indirect item must be %d, not %d", + UNFM_P_SIZE, tb->insert_size[0]); +#endif /* CONFIG_REISERFS_CHECK */ + + set_ih_free_space (pasted, ((struct unfm_nodeinfo*)body)->unfm_freespace); + } + tb->insert_size[0] = 0; + } + +#ifdef CONFIG_REISERFS_CHECK + else { + if ( tb->insert_size[0] ) { + print_cur_tb ("12285"); + reiserfs_panic (tb->tb_sb, "PAP-12285: balance_leaf: insert_size must be 0 (%d)", tb->insert_size[0]); + } + } +#endif /* CONFIG_REISERFS_CHECK */ + + } + } /* case M_PASTE: */ + } + } + +#ifdef CONFIG_REISERFS_CHECK + if ( flag == M_PASTE && tb->insert_size[0] ) { + print_cur_tb ("12290"); + reiserfs_panic (tb->tb_sb, "PAP-12290: balance_leaf: insert_size is still not 0 (%d)", tb->insert_size[0]); + } +#endif /* CONFIG_REISERFS_CHECK */ + + return 0; +} /* Leaf level of the tree is balanced (end of balance_leaf) */ + + + +/* Make empty node */ +void make_empty_node (struct buffer_info * bi) +{ + struct block_head * blkh; + +#ifdef CONFIG_REISERFS_CHECK + if (bi->bi_bh == NULL) + reiserfs_panic (0, "PAP-12295: make_empty_node: pointer to the buffer is NULL"); +#endif + + (blkh = B_BLK_HEAD(bi->bi_bh))->blk_nr_item = cpu_to_le16 (0); + blkh->blk_free_space = cpu_to_le16 (MAX_CHILD_SIZE(bi->bi_bh)); + + if (bi->bi_parent) + B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size = 0; +} + + +/* Get first empty buffer */ +struct buffer_head * get_FEB (struct tree_balance * tb) +{ + int i; + struct buffer_head * first_b; + struct buffer_info bi; + + for (i = 0; i < MAX_FEB_SIZE; i ++) + if (tb->FEB[i] != 0) + break; + + if (i == MAX_FEB_SIZE) + reiserfs_panic(tb->tb_sb, "vs-12300: get_FEB: FEB list is empty"); + + bi.tb = tb; + bi.bi_bh = first_b = tb->FEB[i]; + bi.bi_parent = 0; + bi.bi_position = 0; + make_empty_node (&bi); + set_bit(BH_Uptodate, &first_b->b_state); + tb->FEB[i] = 0; + tb->used[i] = first_b; + +#ifdef REISERFS_FSCK + mark_block_formatted (first_b->b_blocknr); +#endif + + return(first_b); +} + + +/* This is now used because reiserfs_free_block has to be able to +** schedule. 
+*/ +static void store_thrown (struct tree_balance * tb, struct buffer_head * bh) +{ + int i; + + if (buffer_dirty (bh)) + printk ("store_thrown deals with dirty buffer\n"); + for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i ++) + if (!tb->thrown[i]) { + tb->thrown[i] = bh; + atomic_inc(&bh->b_count) ; /* decremented in free_thrown */ + return; + } + reiserfs_warning ("store_thrown: too many thrown buffers\n"); +} + +static void free_thrown(struct tree_balance *tb) { + int i ; + unsigned long blocknr ; + for (i = 0; i < sizeof (tb->thrown)/sizeof (tb->thrown[0]); i++) { + if (tb->thrown[i]) { + blocknr = tb->thrown[i]->b_blocknr ; + if (buffer_dirty (tb->thrown[i])) + printk ("free_thrown deals with dirty buffer %ld\n", blocknr); + brelse(tb->thrown[i]) ; /* incremented in store_thrown */ + reiserfs_free_block (tb->transaction_handle, blocknr); + } + } +} + +void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh) +{ + B_BLK_HEAD (bh)->blk_level = cpu_to_le16 (FREE_LEVEL)/*0*/; + B_BLK_HEAD (bh)->blk_nr_item = cpu_to_le16 (0); + mark_buffer_clean (bh); + /* reiserfs_free_block is no longer schedule safe + reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr); + */ + + store_thrown (tb, bh); +#if 0 +#ifdef REISERFS_FSCK + { + struct buffer_head * to_be_forgotten; + + to_be_forgotten = find_buffer (bh->b_dev, bh->b_blocknr, bh->b_size); + if (to_be_forgotten) { + to_be_forgotten->b_count ++; + bforget (to_be_forgotten); + } + unmark_block_formatted (bh->b_blocknr); + } +#endif +#endif +} + +/* Replace n_dest'th key in buffer dest by n_src'th key of buffer src.*/ +void replace_key (struct tree_balance * tb, struct buffer_head * dest, int n_dest, + struct buffer_head * src, int n_src) +{ + +#ifdef CONFIG_REISERFS_CHECK + if (dest == NULL || src == NULL) + reiserfs_panic (0, "vs-12305: replace_key: sourse or destination buffer is 0 (src=%p, dest=%p)", src, dest); + + if ( ! B_IS_KEYS_LEVEL (dest) ) + reiserfs_panic (0, "vs-12310: replace_key: invalid level (%z) for destination buffer. 
dest must be leaf", + dest); + + if (n_dest < 0 || n_src < 0) + reiserfs_panic (0, "vs-12315: replace_key: src(%d) or dest(%d) key number less than 0", n_src, n_dest); + + if (n_dest >= B_NR_ITEMS(dest) || n_src >= B_NR_ITEMS(src)) + reiserfs_panic (0, "vs-12320: replace_key: src(%d(%d)) or dest(%d(%d)) key number is too big", + n_src, B_NR_ITEMS(src), n_dest, B_NR_ITEMS(dest)); +#endif /* CONFIG_REISERFS_CHECK */ + + if (B_IS_ITEMS_LEVEL (src)) + /* source buffer contains leaf node */ + memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PITEM_HEAD(src,n_src), KEY_SIZE); + else + memcpy (B_N_PDELIM_KEY(dest,n_dest), B_N_PDELIM_KEY(src,n_src), KEY_SIZE); + + do_balance_mark_internal_dirty (tb, dest, 0); +} + + +int get_left_neighbor_position ( + struct tree_balance * tb, + int h + ) +{ + int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); + +#ifdef CONFIG_REISERFS_CHECK + if (PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FL[h] == 0) + reiserfs_panic (tb->tb_sb, "vs-12325: get_left_neighbor_position: FL[%d](%p) or F[%d](%p) does not exist", + h, tb->FL[h], h, PATH_H_PPARENT (tb->tb_path, h)); +#endif + + if (Sh_position == 0) + return B_NR_ITEMS (tb->FL[h]); + else + return Sh_position - 1; +} + + +int get_right_neighbor_position (struct tree_balance * tb, int h) +{ + int Sh_position = PATH_H_POSITION (tb->tb_path, h + 1); + +#ifdef CONFIG_REISERFS_CHECK + if (PATH_H_PPARENT (tb->tb_path, h) == 0 || tb->FR[h] == 0) + reiserfs_panic (tb->tb_sb, "vs-12330: get_right_neighbor_position: F[%d](%p) or FR[%d](%p) does not exist", + h, PATH_H_PPARENT (tb->tb_path, h), h, tb->FR[h]); +#endif + + if (Sh_position == B_NR_ITEMS (PATH_H_PPARENT (tb->tb_path, h))) + return 0; + else + return Sh_position + 1; +} + + +#ifdef CONFIG_REISERFS_CHECK + +int is_reusable (struct super_block * s, unsigned long block, int bit_value); +static void check_internal_node (struct super_block * s, struct buffer_head * bh, char * mes) +{ + struct disk_child * dc; + int i; + + if (!bh) + reiserfs_panic (s, "PAP-12336: check_internal_node: bh == 0"); + + if (!bh || !B_IS_IN_TREE (bh)) + return; + + if (!buffer_dirty (bh) && + !(buffer_journaled(bh) || buffer_journal_dirty(bh))) { + reiserfs_panic (s, "PAP-12337: check_internal_node: buffer (%b) must be dirty", bh); + } + + dc = B_N_CHILD (bh, 0); + + for (i = 0; i <= B_NR_ITEMS (bh); i ++, dc ++) { + if (!is_reusable (s, dc->dc_block_number, 1) ) { + print_cur_tb (mes); + reiserfs_panic (s, "PAP-12338: check_internal_node: invalid child pointer %y in %b", dc, bh); + } + } +} + + +static int locked_or_not_in_tree (struct buffer_head * bh, char * which) +{ + if ( buffer_locked (bh) || !B_IS_IN_TREE (bh) ) { + reiserfs_warning ("vs-12339: locked_or_not_in_tree: %s (%b)\n", which, bh); + return 1; + } + return 0; +} + + +static int check_before_balancing (struct tree_balance * tb) +{ + int retval = 0; + + if ( cur_tb ) { + reiserfs_panic (tb->tb_sb, "vs-12335: check_before_balancing: " + "suspect that schedule occurred based on cur_tb not being null at this point in code. " + "do_balance cannot properly handle schedule occuring while it runs."); + } + + /* double check that buffers that we will modify are unlocked. (fix_nodes should already have + prepped all of these for us). 
*/ + if ( tb->lnum[0] ) { + retval |= locked_or_not_in_tree (tb->L[0], "L[0]"); + retval |= locked_or_not_in_tree (tb->FL[0], "FL[0]"); + retval |= locked_or_not_in_tree (tb->CFL[0], "CFL[0]"); + check_leaf (tb->L[0]); + } + if ( tb->rnum[0] ) { + retval |= locked_or_not_in_tree (tb->R[0], "R[0]"); + retval |= locked_or_not_in_tree (tb->FR[0], "FR[0]"); + retval |= locked_or_not_in_tree (tb->CFR[0], "CFR[0]"); + check_leaf (tb->R[0]); + } + retval |= locked_or_not_in_tree (PATH_PLAST_BUFFER (tb->tb_path), "S[0]"); + check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); + + return retval; +} + + +void check_after_balance_leaf (struct tree_balance * tb) +{ + if (tb->lnum[0]) { + if (B_FREE_SPACE (tb->L[0]) != + MAX_CHILD_SIZE (tb->L[0]) - B_N_CHILD (tb->FL[0], get_left_neighbor_position (tb, 0))->dc_size) { + print_cur_tb ("12221"); + reiserfs_panic (tb->tb_sb, "PAP-12355: check_after_balance_leaf: shift to left was incorrect"); + } + } + if (tb->rnum[0]) { + if (B_FREE_SPACE (tb->R[0]) != + MAX_CHILD_SIZE (tb->R[0]) - B_N_CHILD (tb->FR[0], get_right_neighbor_position (tb, 0))->dc_size) { + print_cur_tb ("12222"); + reiserfs_panic (tb->tb_sb, "PAP-12360: check_after_balance_leaf: shift to right was incorrect"); + } + } + if (PATH_H_PBUFFER(tb->tb_path,1) && (B_FREE_SPACE (PATH_H_PBUFFER(tb->tb_path,0)) != + (MAX_CHILD_SIZE (PATH_H_PBUFFER(tb->tb_path,0)) - + B_N_CHILD (PATH_H_PBUFFER(tb->tb_path,1), + PATH_H_POSITION (tb->tb_path, 1))->dc_size))) { + print_cur_tb ("12223"); + reiserfs_panic (tb->tb_sb, "PAP-12365: check_after_balance_leaf: S is incorrect"); + } +} + + +void check_leaf_level (struct tree_balance * tb) +{ + check_leaf (tb->L[0]); + check_leaf (tb->R[0]); + check_leaf (PATH_PLAST_BUFFER (tb->tb_path)); +} + +void check_internal_levels (struct tree_balance * tb) +{ + int h; + + /* check all internal nodes */ + for (h = 1; tb->insert_size[h]; h ++) { + check_internal_node (tb->tb_sb, PATH_H_PBUFFER (tb->tb_path, h), "BAD BUFFER ON PATH"); + if (tb->lnum[h]) + check_internal_node (tb->tb_sb, tb->L[h], "BAD L"); + if (tb->rnum[h]) + check_internal_node (tb->tb_sb, tb->R[h], "BAD R"); + } + +} + +#endif + + + + + + +/* Now we have all of the buffers that must be used in balancing of + the tree. We rely on the assumption that schedule() will not occur + while do_balance works. ( Only interrupt handlers are acceptable.) + We balance the tree according to the analysis made before this, + using buffers already obtained. For SMP support it will someday be + necessary to add ordered locking of tb. */ + +/* Some interesting rules of balancing: + + we delete a maximum of two nodes per level per balancing: we never + delete R, when we delete two of three nodes L, S, R then we move + them into R. + + we only delete L if we are deleting two nodes, if we delete only + one node we delete S + + if we shift leaves then we shift as much as we can: this is a + deliberate policy of extremism in node packing which results in + higher average utilization after repeated random balance operations + at the cost of more memory copies and more balancing as a result of + small insertions to full nodes. + + if we shift internal nodes we try to evenly balance the node + utilization, with consequent less balancing at the cost of lower + utilization. + + one could argue that the policy for directories in leaves should be + that of internal nodes, but we will wait until another day to + evaluate this.... It would be nice to someday measure and prove + these assumptions as to what is optimal.... 
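The shift policy described in the comment above can be condensed into a small stand-alone sketch. This is an illustration only, not part of the patch: the function names and the simplified item-size array are invented here. The greedy leaf side corresponds to what check_left()/check_right() compute, and the evened-out internal side to the to_l/to_r calculations in the SET_PAR_SHIFT_LEFT/RIGHT macros later in this patch.

/* illustration: pack a leaf neighbor as full as possible */
static int greedy_leaf_shift (int *item_size, int nr_items, int neighbor_free)
{
	int shifted = 0;

	/* leaves: shift every whole item that still fits; higher average
	   utilization at the cost of more copying and more balancing */
	while (shifted < nr_items && item_size[shifted] <= neighbor_free) {
		neighbor_free -= item_size[shifted];
		shifted ++;
	}
	return shifted;
}

/* illustration: internal nodes are only evened out, not packed */
static int even_internal_shift (int items_in_S, int items_in_neighbor)
{
	int diff = items_in_S - items_in_neighbor;

	return (diff > 0) ? (diff / 2) : 0;
}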
+ +*/ + +static inline void do_balance_starts (struct tree_balance *tb) +{ + /* use print_cur_tb() to see initial state of struct + tree_balance */ + + /* store_print_tb (tb); */ + +#ifdef CONFIG_REISERFS_CHECK + + /* do not delete, just comment it out */ +/* print_tb(flag, PATH_LAST_POSITION(tb->tb_path), tb->tb_path->pos_in_item, tb, + "check");*/ + + if (check_before_balancing (tb)) + reiserfs_panic (tb->tb_sb, "PAP-12340: do_balance: locked buffers in TB"); + +#ifndef __KERNEL__ + if ( atomic_read(&(PATH_PLAST_BUFFER(tb->tb_path)->b_count)) > 1 || (tb->L[0] && atomic_read(&(tb->L[0]->b_count)) > 1) || + (tb->R[0] && atomic_read(&(tb->R[0]->b_count)) > 1) ) { + print_cur_tb ("first three parameters are invalid"); + reiserfs_panic (tb->tb_sb, "PAP-12345: do_balance: counter too big"); + } +#endif /* !__KERNEL__ */ + cur_tb = tb; + +#endif /* CONFIG_REISERFS_CHECK */ +} + + +static inline void do_balance_completed (struct tree_balance * tb) +{ + +#ifdef CONFIG_REISERFS_CHECK + check_leaf_level (tb); + check_internal_levels (tb); + cur_tb = NULL; +#endif + + /* reiserfs_free_block is no longer schedule safe. So, we need to + ** put the buffers we want freed on the thrown list during do_balance, + ** and then free them now + */ + + tb->tb_sb->u.reiserfs_sb.s_do_balance ++; + + + /* release all nodes hold to perform the balancing */ + unfix_nodes(tb); + + free_thrown(tb) ; +} + + + + + +void do_balance (struct tree_balance * tb, /* tree_balance structure */ + struct item_head * ih, /* item header of inserted item */ + const char * body, /* body of inserted item or bytes to paste */ + int flag) /* i - insert, d - delete + c - cut, p - paste + + Cut means delete part of an item + (includes removing an entry from a + directory). + + Delete means delete whole item. + + Insert means add a new item into the + tree. + + Paste means to append to the end of an + existing file or to insert a directory + entry. */ +{ + int child_pos, /* position of a child node in its parent */ + h; /* level of the tree being processed */ + struct item_head insert_key[2]; /* in our processing of one level + we sometimes determine what + must be inserted into the next + higher level. This insertion + consists of a key or two keys + and their corresponding + pointers */ + struct buffer_head *insert_ptr[2]; /* inserted node-ptrs for the next + level */ + + tb->tb_mode = flag; + tb->need_balance_dirty = 0; + + if (FILESYSTEM_CHANGED_TB(tb)) { + reiserfs_panic(tb->tb_sb, "clm-6000: do_balance, fs generation has changed\n") ; + } + /* if we have no real work to do */ + if ( ! 
tb->insert_size[0] ) { + reiserfs_warning ("PAP-12350: do_balance: insert_size == 0, mode == %c", + flag); + unfix_nodes(tb); + return; + } + + atomic_inc (&(fs_generation (tb->tb_sb))); + do_balance_starts (tb); + +#ifdef REISERFS_FSCK + if (flag == M_INTERNAL) { + insert_ptr[0] = (struct buffer_head *)body; + /* we must prepare insert_key */ + + if (PATH_H_B_ITEM_ORDER (tb->tb_path, 0)/*LAST_POSITION (tb->tb_path)*//*item_pos*/ == -1) { + /* get delimiting key from buffer in tree */ + copy_key (&insert_key[0].ih_key, B_N_PKEY (PATH_PLAST_BUFFER (tb->tb_path), 0)); + /*insert_ptr[0]->b_item_order = 0;*/ + } else { + /* get delimiting key from new buffer */ + copy_key (&insert_key[0].ih_key, B_N_PKEY((struct buffer_head *)body,0)); + /*insert_ptr[0]->b_item_order = item_pos;*/ + } + + /* and insert_ptr instead of balance_leaf */ + child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0)/*item_pos*/; + } else +#endif + + /* balance leaf returns 0 except if combining L R and S into + one node. see balance_internal() for explanation of this + line of code.*/ + child_pos = PATH_H_B_ITEM_ORDER (tb->tb_path, 0) + + balance_leaf (tb, ih, body, flag, insert_key, insert_ptr); + +#ifdef CONFIG_REISERFS_CHECK + check_after_balance_leaf (tb); +#endif + + /* Balance internal level of the tree. */ + for ( h = 1; h < MAX_HEIGHT && tb->insert_size[h]; h++ ) + child_pos = balance_internal (tb, h, child_pos, insert_key, insert_ptr); + + + do_balance_completed (tb); + +} diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/file.c linux/fs/reiserfs/file.c --- v2.4.0/linux/fs/reiserfs/file.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/file.c Mon Jan 15 13:23:01 2001 @@ -0,0 +1,124 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + + +#ifdef __KERNEL__ + +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +/* +** We pack the tails of files on file close, not at the time they are written. +** This implies an unnecessary copy of the tail and an unnecessary indirect item +** insertion/balancing, for files that are written in one write. +** It avoids unnecessary tail packings (balances) for files that are written in +** multiple writes and are small enough to have tails. +** +** file_release is called by the VFS layer when the file is closed. If +** this is the last open file descriptor, and the file +** small enough to have a tail, and the tail is currently in an +** unformatted node, the tail is converted back into a direct item. +** +** We use reiserfs_truncate_file to pack the tail, since it already has +** all the conditions coded. 
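The close-time decision described above amounts to a three-part test. The struct and helper below are a sketch invented for illustration (the real code uses inode->i_count, i_pack_on_close and tail_has_to_be_packed(), as in reiserfs_file_release() that follows): only the last closer repacks, and only when an unpacked tail is actually there.

struct toy_inode {
	int open_count;       /* file descriptors still referencing the inode */
	int pack_on_close;    /* earlier writes left the tail unpacked */
	int tail_needs_pack;  /* file is small enough to keep its tail direct */
};

static int should_pack_tail (struct toy_inode *inode)
{
	/* do the (comparatively expensive) indirect-to-direct conversion
	   only on the last close, and only when there is a tail to repack */
	return inode->open_count <= 1 &&
	       inode->pack_on_close &&
	       inode->tail_needs_pack;
}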
+*/ +static int reiserfs_file_release (struct inode * inode, struct file * filp) +{ + + struct reiserfs_transaction_handle th ; + int windex ; + + if (!S_ISREG (inode->i_mode)) + BUG (); + + /* fast out for when nothing needs to be done */ + if ((atomic_read(&inode->i_count) > 1 || + !inode->u.reiserfs_i.i_pack_on_close || + !tail_has_to_be_packed(inode)) && + inode->u.reiserfs_i.i_prealloc_count <= 0) { + return 0; + } + + lock_kernel() ; + down (&inode->i_sem); + journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ; + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc (&th, inode); +#endif + journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3) ; + + if (atomic_read(&inode->i_count) <= 1 && + inode->u.reiserfs_i.i_pack_on_close && + tail_has_to_be_packed (inode)) { + /* if regular file is released by last holder and it has been + appended (we append by unformatted node only) or its direct + item(s) had to be converted, then it may have to be + indirect2direct converted */ + windex = push_journal_writer("file_release") ; + reiserfs_truncate_file(inode, 0) ; + pop_journal_writer(windex) ; + } + up (&inode->i_sem); + unlock_kernel() ; + return 0; +} + +static void reiserfs_vfs_truncate_file(struct inode *inode) { + reiserfs_truncate_file(inode, 1) ; +} + +/* Sync a reiserfs file. */ +static int reiserfs_sync_file( + struct file * p_s_filp, + struct dentry * p_s_dentry, + int datasync + ) { + struct inode * p_s_inode = p_s_dentry->d_inode; + struct reiserfs_transaction_handle th ; + int n_err = 0; + int windex ; + int jbegin_count = 1 ; + + lock_kernel() ; + + if (!S_ISREG(p_s_inode->i_mode)) + BUG (); + + n_err = fsync_inode_buffers(p_s_inode) ; + /* commit the current transaction to flush any metadata + ** changes. sys_fsync takes care of flushing the dirty pages for us + */ + journal_begin(&th, p_s_inode->i_sb, jbegin_count) ; + windex = push_journal_writer("sync_file") ; + reiserfs_update_sd(&th, p_s_inode); + pop_journal_writer(windex) ; + journal_end_sync(&th, p_s_inode->i_sb,jbegin_count) ; + unlock_kernel() ; + return ( n_err < 0 ) ? 
-EIO : 0; +} + + +struct file_operations reiserfs_file_operations = { + read: generic_file_read, + write: generic_file_write, + ioctl: reiserfs_ioctl, + mmap: generic_file_mmap, + release: reiserfs_file_release, + fsync: reiserfs_sync_file, +}; + + +struct inode_operations reiserfs_file_inode_operations = { + truncate: reiserfs_vfs_truncate_file, +}; + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/fix_node.c linux/fs/reiserfs/fix_node.c --- v2.4.0/linux/fs/reiserfs/fix_node.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/fix_node.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,2908 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/** + ** old_item_num + ** old_entry_num + ** set_entry_sizes + ** create_virtual_node + ** check_left + ** check_right + ** directory_part_size + ** get_num_ver + ** set_parameters + ** is_leaf_removable + ** are_leaves_removable + ** get_empty_nodes + ** get_lfree + ** get_rfree + ** is_left_neighbor_in_cache + ** decrement_key + ** get_far_parent + ** get_parents + ** can_node_be_removed + ** ip_check_balance + ** dc_check_balance_internal + ** dc_check_balance_leaf + ** dc_check_balance + ** check_balance + ** get_direct_parent + ** get_neighbors + ** fix_nodes + ** + ** + **/ + + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + + +/* To make any changes in the tree we find a node, that contains item + to be changed/deleted or position in the node we insert a new item + to. We call this node S. To do balancing we need to decide what we + will shift to left/right neighbor, or to a new node, where new item + will be etc. To make this analysis simpler we build virtual + node. Virtual node is an array of items, that will replace items of + node S. (For instance if we are going to delete an item, virtual + node does not contain it). Virtual node keeps information about + item sizes and types, mergeability of first and last items, sizes + of all entries in directory item. We use this array of items when + calculating what we can shift to neighbors and how many nodes we + have to have if we do not any shiftings, if we shift to left/right + neighbor or to both. */ + + +/* taking item number in virtual node, returns number of item, that it has in source buffer */ +static inline int old_item_num (int new_num, int affected_item_num, int mode) +{ + if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num) + return new_num; + + if (mode == M_INSERT) { + +#ifdef CONFIG_REISERFS_CHECK + if (new_num == 0) + reiserfs_panic (0,"vs-8005: old_item_num: for INSERT mode and item number of inserted item"); +#endif + + return new_num - 1; + } + +#ifdef CONFIG_REISERFS_CHECK + if (mode != M_DELETE) + reiserfs_panic (0, "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'", mode); +#endif + + /* delete mode */ + return new_num + 1; +} + +static void create_virtual_node (struct tree_balance * tb, int h) +{ + struct item_head * ih; + struct virtual_node * vn = tb->tb_vn; + int new_num; + struct buffer_head * Sh; /* this comes from tb->S[h] */ + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + + /* size of changed node */ + vn->vn_size = MAX_CHILD_SIZE (Sh) - B_FREE_SPACE (Sh) + tb->insert_size[h]; + + /* for internal nodes array if virtual items is not created */ + if (h) { + vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE); + return; + } + + /* number of items in virtual node */ + vn->vn_nr_item = B_NR_ITEMS (Sh) + ((vn->vn_mode == M_INSERT)? 
1 : 0) - ((vn->vn_mode == M_DELETE)? 1 : 0); + + /* first virtual item */ + vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1); + memset (vn->vn_vi, 0, vn->vn_nr_item * sizeof (struct virtual_item)); + vn->vn_free_ptr += vn->vn_nr_item * sizeof (struct virtual_item); + + + /* first item in the node */ + ih = B_N_PITEM_HEAD (Sh, 0); + + /* define the mergeability for 0-th item (if it is not being deleted) */ +#ifdef REISERFS_FSCK + if (is_left_mergeable (tb->tb_sb, tb->tb_path) == 1 && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) +#else + if (op_is_left_mergeable (&(ih->ih_key), Sh->b_size) && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num)) +#endif + vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE; + + /* go through all items those remain in the virtual node (except for the new (inserted) one) */ + for (new_num = 0; new_num < vn->vn_nr_item; new_num ++) { + int j; + struct virtual_item * vi = vn->vn_vi + new_num; + int is_affected = ((new_num != vn->vn_affected_item_num) ? 0 : 1); + + + if (is_affected && vn->vn_mode == M_INSERT) + continue; + + /* get item number in source node */ + j = old_item_num (new_num, vn->vn_affected_item_num, vn->vn_mode); + + vi->vi_item_len += ih[j].ih_item_len + IH_SIZE; + vi->vi_ih = ih + j; + vi->vi_item = B_I_PITEM (Sh, ih + j); + vi->vi_uarea = vn->vn_free_ptr; + + // FIXME: there is no check, that item operation did not + // consume too much memory + vn->vn_free_ptr += op_create_vi (vn, vi, is_affected, tb->insert_size [0]); + if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr) + reiserfs_panic (tb->tb_sb, "vs-8030: create_virtual_node: " + "virtual node space consumed"); + + if (!is_affected) + /* this is not being changed */ + continue; + + if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) { + vn->vn_vi[new_num].vi_item_len += tb->insert_size[0]; + vi->vi_new_data = vn->vn_data; // pointer to data which is going to be pasted + } + } + + + /* virtual inserted item is not defined yet */ + if (vn->vn_mode == M_INSERT) { + struct virtual_item * vi = vn->vn_vi + vn->vn_affected_item_num; + +#ifdef CONFIG_REISERFS_CHECK + if (vn->vn_ins_ih == 0) + reiserfs_panic (0, "vs-8040: create_virtual_node: item header of inserted item is not specified"); +#endif + + vi->vi_item_len = tb->insert_size[0]; + vi->vi_ih = vn->vn_ins_ih; + vi->vi_item = vn->vn_data; + vi->vi_uarea = vn->vn_free_ptr; + + op_create_vi (vn, vi, 0/*not pasted or cut*/, tb->insert_size [0]); +#if 0 + switch (type/*le_key_k_type (ih_version (vn->vn_ins_ih), &(vn->vn_ins_ih->ih_key))*/) { + case TYPE_STAT_DATA: + vn->vn_vi[vn->vn_affected_item_num].vi_type |= VI_TYPE_STAT_DATA; + break; + case TYPE_DIRECT: + vn->vn_vi[vn->vn_affected_item_num].vi_type |= VI_TYPE_DIRECT; + break; + case TYPE_INDIRECT: + vn->vn_vi[vn->vn_affected_item_num].vi_type |= VI_TYPE_INDIRECT; + break; + default: + /* inseted item is directory (it must be item with "." 
and "..") */ + vn->vn_vi[vn->vn_affected_item_num].vi_type |= + (VI_TYPE_DIRECTORY | VI_TYPE_FIRST_DIRECTORY_ITEM | VI_TYPE_INSERTED_DIRECTORY_ITEM); + + /* this directory item can not be split, so do not set sizes of entries */ + break; + } +#endif + } + + /* set right merge flag we take right delimiting key and check whether it is a mergeable item */ + if (tb->CFR[0]) { + struct key * key; + + key = B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]); +#ifdef REISERFS_FSCK + if (is_right_mergeable (tb->tb_sb, tb->tb_path) == 1 && (vn->vn_mode != M_DELETE || + vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1)) +#else + if (op_is_left_mergeable (key, Sh->b_size) && (vn->vn_mode != M_DELETE || + vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1)) +#endif + vn->vn_vi[vn->vn_nr_item-1].vi_type |= VI_TYPE_RIGHT_MERGEABLE; + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable (key, Sh->b_size) && + !(vn->vn_mode != M_DELETE || vn->vn_affected_item_num != B_NR_ITEMS (Sh) - 1) ) { + /* we delete last item and it could be merged with right neighbor's first item */ + if (!(B_NR_ITEMS (Sh) == 1 && is_direntry_le_ih (B_N_PITEM_HEAD (Sh, 0)) && + I_ENTRY_COUNT (B_N_PITEM_HEAD (Sh, 0)) == 1)) { + /* node contains more than 1 item, or item is not directory item, or this item contains more than 1 entry */ + print_block (Sh, 0, -1, -1); + reiserfs_panic (tb->tb_sb, "vs-8045: create_virtual_node: rdkey %k, affected item==%d (mode==%c) Must be %c", + key, vn->vn_affected_item_num, vn->vn_mode, M_DELETE); + } else + /* we can delete directory item, that has only one directory entry in it */ + ; + } +#endif + + } +} + + +/* using virtual node check, how many items can be shifted to left + neighbor */ +static void check_left (struct tree_balance * tb, int h, int cur_free) +{ + int i; + struct virtual_node * vn = tb->tb_vn; + struct virtual_item * vi; + int d_size, ih_size; + +#ifdef CONFIG_REISERFS_CHECK + if (cur_free < 0) + reiserfs_panic (0, "vs-8050: check_left: cur_free (%d) < 0", cur_free); +#endif + + /* internal level */ + if (h > 0) { + tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space or nothing to move */ + tb->lnum[h] = 0; + tb->lbytes = -1; + return; + } + +#ifdef CONFIG_REISERFS_CHECK + if (!PATH_H_PPARENT (tb->tb_path, 0)) + reiserfs_panic (0, "vs-8055: check_left: parent does not exist or invalid"); +#endif + + vi = vn->vn_vi; + if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into L[0] */ + +#ifdef CONFIG_REISERFS_CHECK + if (vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE) + reiserfs_panic (0, "vs-8055: check_left: invalid mode or balance condition failed"); +#endif + + tb->lnum[0] = vn->vn_nr_item; + tb->lbytes = -1; + return; + } + + + d_size = 0, ih_size = IH_SIZE; + + /* first item may be merge with last item in left neighbor */ + if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE) + d_size = -((int)IH_SIZE), ih_size = 0; + + tb->lnum[0] = 0; + for (i = 0; i < vn->vn_nr_item; i ++, ih_size = IH_SIZE, d_size = 0, vi ++) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->lnum[0] ++; + continue; + } + + /* the item cannot be shifted entirely, try to split it */ + /* check whether L[0] can hold ih and at least one byte of the item body */ + if (cur_free <= ih_size) { + /* cannot shift even a part of the current item */ + tb->lbytes = -1; + return; + } + cur_free -= ih_size; + + tb->lbytes = op_check_left (vi, cur_free, 0, 0); + if (tb->lbytes != -1) + /* count partially shifted item */ + tb->lnum[0] ++; + + break; + } + + return; +} + + +/* using virtual node check, how many items can be shifted to right + neighbor */ +static void check_right (struct tree_balance * tb, int h, int cur_free) +{ + int i; + struct virtual_node * vn = tb->tb_vn; + struct virtual_item * vi; + int d_size, ih_size; + +#ifdef CONFIG_REISERFS_CHECK + if (cur_free < 0) + reiserfs_panic (tb->tb_sb, "vs-8070: check_right: cur_free < 0"); +#endif + + /* internal level */ + if (h > 0) { + tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE); + return; + } + + /* leaf level */ + + if (!cur_free || !vn->vn_nr_item) { + /* no free space */ + tb->rnum[h] = 0; + tb->rbytes = -1; + return; + } + +#ifdef CONFIG_REISERFS_CHECK + if (!PATH_H_PPARENT (tb->tb_path, 0)) + reiserfs_panic (tb->tb_sb, "vs-8075: check_right: parent does not exist or invalid"); +#endif + + vi = vn->vn_vi + vn->vn_nr_item - 1; + if ((unsigned int)cur_free >= (vn->vn_size - ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? 
IH_SIZE : 0))) { + /* all contents of S[0] fits into R[0] */ + +#ifdef CONFIG_REISERFS_CHECK + if (vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE) + reiserfs_panic (tb->tb_sb, "vs-8080: check_right: invalid mode or balance condition failed"); +#endif + + tb->rnum[h] = vn->vn_nr_item; + tb->rbytes = -1; + return; + } + + d_size = 0, ih_size = IH_SIZE; + + /* last item may be merge with first item in right neighbor */ + if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) + d_size = -(int)IH_SIZE, ih_size = 0; + + tb->rnum[0] = 0; + for (i = vn->vn_nr_item - 1; i >= 0; i --, d_size = 0, ih_size = IH_SIZE, vi --) { + d_size += vi->vi_item_len; + if (cur_free >= d_size) { + /* the item can be shifted entirely */ + cur_free -= d_size; + tb->rnum[0] ++; + continue; + } + + /* check whether R[0] can hold ih and at least one byte of the item body */ + if ( cur_free <= ih_size ) { /* cannot shift even a part of the current item */ + tb->rbytes = -1; + return; + } + + /* R[0] can hold the header of the item and at least one byte of its body */ + cur_free -= ih_size; /* cur_free is still > 0 */ + + tb->rbytes = op_check_right (vi, cur_free); + if (tb->rbytes != -1) + /* count partially shifted item */ + tb->rnum[0] ++; + + break; + } + + return; +} + + +/* + * from - number of items, which are shifted to left neighbor entirely + * to - number of item, which are shifted to right neighbor entirely + * from_bytes - number of bytes of boundary item (or directory entries) which are shifted to left neighbor + * to_bytes - number of bytes of boundary item (or directory entries) which are shifted to right neighbor */ +static int get_num_ver (int mode, struct tree_balance * tb, int h, + int from, int from_bytes, + int to, int to_bytes, + short * snum012, int flow + ) +{ + int i; + int cur_free; + // int bytes; + int units; + struct virtual_node * vn = tb->tb_vn; + // struct virtual_item * vi; + + int total_node_size, max_node_size, current_item_size; + int needed_nodes; + int start_item, /* position of item we start filling node from */ + end_item, /* position of item we finish filling node by */ + start_bytes,/* number of first bytes (entries for directory) of start_item-th item + we do not include into node that is being filled */ + end_bytes; /* number of last bytes (entries for directory) of end_item-th item + we do node include into node that is being filled */ + int split_item_positions[2]; /* these are positions in virtual item of + items, that are split between S[0] and + S1new and S1new and S2new */ + + split_item_positions[0] = -1; + split_item_positions[1] = -1; + +#ifdef CONFIG_REISERFS_CHECK + /* We only create additional nodes if we are in insert or paste mode + or we are in replace mode at the internal level. If h is 0 and + the mode is M_REPLACE then in fix_nodes we change the mode to + paste or insert before we get here in the code. 
*/ + if ( tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE)) + reiserfs_panic (0, "vs-8100: get_num_ver: insert_size < 0 in overflow"); +#endif + + max_node_size = MAX_CHILD_SIZE (PATH_H_PBUFFER (tb->tb_path, h)); + + /* snum012 [0-2] - number of items, that lay + to S[0], first new node and second new node */ + snum012[3] = -1; /* s1bytes */ + snum012[4] = -1; /* s2bytes */ + + /* internal level */ + if (h > 0) { + i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE); + if (i == max_node_size) + return 1; + return (i / max_node_size + 1); + } + + /* leaf level */ + needed_nodes = 1; + total_node_size = 0; + cur_free = max_node_size; + + // start from 'from'-th item + start_item = from; + // skip its first 'start_bytes' units + start_bytes = ((from_bytes != -1) ? from_bytes : 0); + + // last included item is the 'end_item'-th one + end_item = vn->vn_nr_item - to - 1; + // do not count last 'end_bytes' units of 'end_item'-th item + end_bytes = (to_bytes != -1) ? to_bytes : 0; + + /* go through all item begining from the start_item-th item and ending by + the end_item-th item. Do not count first 'start_bytes' units of + 'start_item'-th item and last 'end_bytes' of 'end_item'-th item */ + + for (i = start_item; i <= end_item; i ++) { + struct virtual_item * vi = vn->vn_vi + i; + int skip_from_end = ((i == end_item) ? end_bytes : 0); + +#ifdef CONFIG_REISERFS_CHECK + if (needed_nodes > 3) { + reiserfs_panic (tb->tb_sb, "vs-8105: get_num_ver: too many nodes are needed"); + } +#endif + + /* get size of current item */ + current_item_size = vi->vi_item_len; + + /* do not take in calculation head part (from_bytes) of from-th item */ + current_item_size -= op_part_size (vi, 0/*from start*/, start_bytes); + + /* do not take in calculation tail part of last item */ + current_item_size -= op_part_size (vi, 1/*from end*/, skip_from_end); + + /* if item fits into current node entierly */ + if (total_node_size + current_item_size <= max_node_size) { + snum012[needed_nodes - 1] ++; + total_node_size += current_item_size; + start_bytes = 0; + continue; + } + + if (current_item_size > max_node_size) { + /* virtual item length is longer, than max size of item in + a node. It is impossible for direct item */ +#ifdef CONFIG_REISERFS_CHECK + if (is_direct_le_ih (vi->vi_ih)) + reiserfs_panic (tb->tb_sb, "vs-8110: get_num_ver: " + "direct item length is %d. 
It can not be longer than %d", + current_item_size, max_node_size); +#endif + /* we will try to split it */ + flow = 1; + } + + if (!flow) { + /* as we do not split items, take new node and continue */ + needed_nodes ++; i --; total_node_size = 0; + continue; + } + + // calculate number of item units which fit into node being + // filled + { + int free_space; + + free_space = max_node_size - total_node_size - IH_SIZE; + units = op_check_left (vi, free_space, start_bytes, skip_from_end); + if (units == -1) { + /* nothing fits into current node, take new node and continue */ + needed_nodes ++, i--, total_node_size = 0; + continue; + } + } + + /* something fits into the current node */ + //if (snum012[3] != -1 || needed_nodes != 1) + // reiserfs_panic (tb->tb_sb, "vs-8115: get_num_ver: too many nodes required"); + //snum012[needed_nodes - 1 + 3] = op_unit_num (vi) - start_bytes - units; + start_bytes += units; + snum012[needed_nodes - 1 + 3] = units; + + if (needed_nodes > 2) + reiserfs_warning ("vs-8111: get_num_ver: split_item_position is out of boundary\n"); + snum012[needed_nodes - 1] ++; + split_item_positions[needed_nodes - 1] = i; + needed_nodes ++; + /* continue from the same item with start_bytes != -1 */ + start_item = i; + i --; + total_node_size = 0; + } + + // sum012[4] (if it is not -1) contains number of units of which + // are to be in S1new, snum012[3] - to be in S0. They are supposed + // to be S1bytes and S2bytes correspondingly, so recalculate + if (snum012[4] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S1new; + + split_item_num = split_item_positions[1]; + bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); + bytes_to_S1new = ((split_item_positions[0] == split_item_positions[1]) ? snum012[3] : 0); + + // s2bytes + snum012[4] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[4] - bytes_to_r - bytes_to_l - bytes_to_S1new; + + if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY) + reiserfs_warning ("vs-8115: get_num_ver: not directory item\n"); + } + + /* now we know S2bytes, calculate S1bytes */ + if (snum012[3] > 0) { + int split_item_num; + int bytes_to_r, bytes_to_l; + int bytes_to_S2new; + + split_item_num = split_item_positions[0]; + bytes_to_l = ((from == split_item_num && from_bytes != -1) ? from_bytes : 0); + bytes_to_r = ((end_item == split_item_num && end_bytes != -1) ? end_bytes : 0); + bytes_to_S2new = ((split_item_positions[0] == split_item_positions[1] && snum012[4] != -1) ? snum012[4] : 0); + + // s1bytes + snum012[3] = op_unit_num (&vn->vn_vi[split_item_num]) - snum012[3] - bytes_to_r - bytes_to_l - bytes_to_S2new; + } + + return needed_nodes; +} + + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + + +/* Set parameters for balancing. + * Performs write of results of analysis of balancing into structure tb, + * where it will later be used by the functions that actually do the balancing. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * lnum number of items from S[h] that must be shifted to L[h]; + * rnum number of items from S[h] that must be shifted to R[h]; + * blk_num number of blocks that S[h] will be splitted into; + * s012 number of items that fall into splitted nodes. 
+ * lbytes number of bytes which flow to the left neighbor from the item that is not + * not shifted entirely + * rbytes number of bytes which flow to the right neighbor from the item that is not + * not shifted entirely + * s1bytes number of bytes which flow to the first new node when S[0] splits (this number is contained in s012 array) + */ + +static void set_parameters (struct tree_balance * tb, int h, int lnum, + int rnum, int blk_num, short * s012, int lb, int rb) +{ + + tb->lnum[h] = lnum; + tb->rnum[h] = rnum; + tb->blknum[h] = blk_num; + + if (h == 0) + { /* only for leaf level */ + if (s012 != NULL) + { + tb->s0num = * s012 ++, + tb->s1num = * s012 ++, + tb->s2num = * s012 ++; + tb->s1bytes = * s012 ++; + tb->s2bytes = * s012; + } + tb->lbytes = lb; + tb->rbytes = rb; + } +} + + + +/* check, does node disappear if we shift tb->lnum[0] items to left + neighbor and tb->rnum[0] to the right one. */ +static int is_leaf_removable (struct tree_balance * tb) +{ + struct virtual_node * vn = tb->tb_vn; + int to_left, to_right; + int size; + int remain_items; + + /* number of items, that will be shifted to left (right) neighbor + entirely */ + to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0); + to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0); + remain_items = vn->vn_nr_item; + + /* how many items remain in S[0] after shiftings to neighbors */ + remain_items -= (to_left + to_right); + + if (remain_items < 1) { + /* all content of node can be shifted to neighbors */ + set_parameters (tb, 0, to_left, vn->vn_nr_item - to_left, 0, NULL, -1, -1); + return 1; + } + + if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1) + /* S[0] is not removable */ + return 0; + + /* check, whether we can divide 1 remaining item between neighbors */ + + /* get size of remaining item (in item units) */ + size = op_unit_num (&(vn->vn_vi[to_left])); + + if (tb->lbytes + tb->rbytes >= size) { + set_parameters (tb, 0, to_left + 1, to_right + 1, 0, NULL, tb->lbytes, -1); + return 1; + } + + return 0; +} + + +/* check whether L, S, R can be joined in one node */ +static int are_leaves_removable (struct tree_balance * tb, int lfree, int rfree) +{ + struct virtual_node * vn = tb->tb_vn; + int ih_size; + struct buffer_head *S0; + + S0 = PATH_H_PBUFFER (tb->tb_path, 0); + + ih_size = 0; + if (vn->vn_nr_item) { + if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE) + ih_size += IH_SIZE; + + if (vn->vn_vi[vn->vn_nr_item-1].vi_type & VI_TYPE_RIGHT_MERGEABLE) + ih_size += IH_SIZE; + } else { + /* there was only one item and it will be deleted */ + struct item_head * ih; + +#ifdef CONFIG_REISERFS_CHECK + if (B_NR_ITEMS (S0) != 1) + reiserfs_panic (0, "vs-8125: are_leaves_removable: item number must be 1: it is %d", B_NR_ITEMS(S0)); +#endif + + ih = B_N_PITEM_HEAD (S0, 0); + if (tb->CFR[0] && !comp_short_le_keys (&(ih->ih_key), B_N_PDELIM_KEY (tb->CFR[0], tb->rkey[0]))) + if (is_direntry_le_ih (ih)) { +#ifndef REISERFS_FSCK + + /* Directory must be in correct state here: that is + somewhere at the left side should exist first directory + item. But the item being deleted can not be that first + one because its right neighbor is item of the same + directory. (But first item always gets deleted in last + turn). 
So, neighbors of deleted item can be merged, so + we can save ih_size */ + ih_size = IH_SIZE; + +#ifdef CONFIG_REISERFS_CHECK + /* we might check that left neighbor exists and is of the + same directory */ + if (le_key_k_offset (ih_version (ih), &(ih->ih_key)) == DOT_OFFSET) + reiserfs_panic (tb->tb_sb, "vs-8130: are_leaves_removable: " + "first directory item can not be removed until directory is not empty"); +#endif + + +#else /* REISERFS_FSCK */ + + /* we can delete any directory item in fsck (if it is unreachable) */ + if (ih->ih_key.k_offset != DOT_OFFSET) { + /* must get left neighbor here to make sure, that left + neighbor is of the same directory */ + struct buffer_head * left; + + left = get_left_neighbor (tb->tb_sb, tb->tb_path); + if (left) { + struct item_head * last; + + if (B_NR_ITEMS (left) == 0) + reiserfs_panic (tb->tb_sb, "vs-8135: are_leaves_removable: " + "empty node in the tree"); + last = B_N_PITEM_HEAD (left, B_NR_ITEMS (left) - 1); + if (!comp_short_keys (&last->ih_key, &ih->ih_key)) + ih_size = IH_SIZE; + brelse (left); + } + } +#endif + } + + } + + if (MAX_CHILD_SIZE (S0) + vn->vn_size <= rfree + lfree + ih_size) { + set_parameters (tb, 0, -1, -1, -1, NULL, -1, -1); + return 1; + } + return 0; + +} + + + +/* when we do not split item, lnum and rnum are numbers of entire items */ +#define SET_PAR_SHIFT_LEFT \ +if (h)\ +{\ + int to_l;\ + \ + to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\ + (MAX_NR_KEY(Sh) + 1 - lpar);\ + \ + set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (lset==LEFT_SHIFT_FLOW)\ + set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\ + tb->lbytes, -1);\ + else\ + set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\ + -1, -1);\ +} + + +#define SET_PAR_SHIFT_RIGHT \ +if (h)\ +{\ + int to_r;\ + \ + to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\ + \ + set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\ +}\ +else \ +{\ + if (rset==RIGHT_SHIFT_FLOW)\ + set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\ + -1, tb->rbytes);\ + else\ + set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\ + -1, -1);\ +} + + +void free_buffers_in_tb ( + struct tree_balance * p_s_tb + ) { + int n_counter; + + decrement_counters_in_path(p_s_tb->tb_path); + + for ( n_counter = 0; n_counter < MAX_HEIGHT; n_counter++ ) { + decrement_bcount(p_s_tb->L[n_counter]); + p_s_tb->L[n_counter] = NULL; + decrement_bcount(p_s_tb->R[n_counter]); + p_s_tb->R[n_counter] = NULL; + decrement_bcount(p_s_tb->FL[n_counter]); + p_s_tb->FL[n_counter] = NULL; + decrement_bcount(p_s_tb->FR[n_counter]); + p_s_tb->FR[n_counter] = NULL; + decrement_bcount(p_s_tb->CFL[n_counter]); + p_s_tb->CFL[n_counter] = NULL; + decrement_bcount(p_s_tb->CFR[n_counter]); + p_s_tb->CFR[n_counter] = NULL; + } +} + + +/* Get new buffers for storing new nodes that are created while balancing. + * Returns: SCHEDULE_OCCURED - schedule occured while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + * NO_DISK_SPACE - no disk space. + */ +/* The function is NOT SCHEDULE-SAFE! 
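The free-block accounting that get_empty_nodes() performs just below reduces to a short piece of arithmetic. The helper here is an invented illustration (the real code works on tb->cur_blknum, tb->blknum[] and the presence of S[h]): blocks already acquired for the levels below h are subtracted first, and only the shortfall at this level is then allocated.

static int blocks_to_allocate (int cur_blknum, int *blknum, int h, int have_Sh)
{
	int level;
	int free_blocks = cur_blknum;	/* empty blocks acquired so far */
	int needed;

	/* subtract what the levels below h already consumed; blknum[]
	   counts S[level] itself, hence the "- 1" */
	for (level = 0; level < h; level ++)
		free_blocks -= blknum[level] ? (blknum[level] - 1) : 0;

	/* no S[h] means a new root is being added: exactly one block */
	needed = have_Sh ? (blknum[h] - 1) : 1;

	return (needed > free_blocks) ? (needed - free_blocks) : 0;
}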
*/ +static int get_empty_nodes( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_new_bh, + * p_s_Sh = PATH_H_PBUFFER (p_s_tb->tb_path, n_h); + unsigned long * p_n_blocknr, + a_n_blocknrs[MAX_AMOUNT_NEEDED] = {0, }; + int n_counter, + n_number_of_freeblk, + n_amount_needed,/* number of needed empty blocks */ + n_retval = CARRY_ON; + struct super_block * p_s_sb = p_s_tb->tb_sb; + + +#ifdef REISERFS_FSCK + if (n_h == 0 && p_s_tb->insert_size[n_h] == 0x7fff) + return CARRY_ON; +#endif + + /* number_of_freeblk is the number of empty blocks which have been + acquired for use by the balancing algorithm minus the number of + empty blocks used in the previous levels of the analysis, + number_of_freeblk = tb->cur_blknum can be non-zero if a schedule occurs + after empty blocks are acquired, and the balancing analysis is + then restarted, amount_needed is the number needed by this level + (n_h) of the balancing analysis. + + Note that for systems with many processes writing, it would be + more layout optimal to calculate the total number needed by all + levels and then to run reiserfs_new_blocks to get all of them at once. */ + + /* Initiate number_of_freeblk to the amount acquired prior to the restart of + the analysis or 0 if not restarted, then subtract the amount needed + by all of the levels of the tree below n_h. */ + /* blknum includes S[n_h], so we subtract 1 in this calculation */ + for ( n_counter = 0, n_number_of_freeblk = p_s_tb->cur_blknum; n_counter < n_h; n_counter++ ) + n_number_of_freeblk -= ( p_s_tb->blknum[n_counter] ) ? (p_s_tb->blknum[n_counter] - 1) : 0; + + /* Allocate missing empty blocks. */ + /* if p_s_Sh == 0 then we are getting a new root */ + n_amount_needed = ( p_s_Sh ) ? (p_s_tb->blknum[n_h] - 1) : 1; + /* Amount_needed = the amount that we need more than the amount that we have. */ + if ( n_amount_needed > n_number_of_freeblk ) + n_amount_needed -= n_number_of_freeblk; + else /* If we have enough already then there is nothing to do. */ + return CARRY_ON; + + if ( reiserfs_new_blocknrs (p_s_tb->transaction_handle, a_n_blocknrs, + PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_blocknr, n_amount_needed) == NO_DISK_SPACE ) + return NO_DISK_SPACE; + + /* for each blocknumber we just got, get a buffer and stick it on FEB */ + for ( p_n_blocknr = a_n_blocknrs, n_counter = 0; n_counter < n_amount_needed; + p_n_blocknr++, n_counter++ ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! 
*p_n_blocknr ) + reiserfs_panic(p_s_sb, "PAP-8135: get_empty_nodes: reiserfs_new_blocknrs failed when got new blocks"); +#endif + + p_s_new_bh = reiserfs_getblk(p_s_sb->s_dev, *p_n_blocknr, p_s_sb->s_blocksize); + if (atomic_read (&(p_s_new_bh->b_count)) > 1) { +/*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/ +/* + reiserfs_warning ("waiting for buffer %b, iput inode pid = %d, this pid %d, mode %c, %h\n", + p_s_new_bh, put_inode_pid, current->pid, p_s_tb->tb_vn->vn_mode, p_s_tb->tb_vn->vn_ins_ih); + print_tb (0, 0, 0, p_s_tb, "tb"); +*/ +/*&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&*/ + if (atomic_read(&(p_s_new_bh->b_count)) > 2 || + !(buffer_journaled(p_s_new_bh) || buffer_journal_dirty(p_s_new_bh))) { + n_retval = REPEAT_SEARCH ; + free_buffers_in_tb (p_s_tb); + wait_buffer_until_released (p_s_new_bh); + } + } +#ifdef CONFIG_REISERFS_CHECK + if (atomic_read (&(p_s_new_bh->b_count)) != 1 || buffer_dirty (p_s_new_bh)) { + if (atomic_read(&(p_s_new_bh->b_count)) > 2 || + !(buffer_journaled(p_s_new_bh) || buffer_journal_dirty(p_s_new_bh))) { + reiserfs_panic(p_s_sb,"PAP-8140: get_empty_nodes: not free or dirty buffer %b for the new block", + p_s_new_bh); + } + } +#endif + + /* Put empty buffers into the array. */ + if (p_s_tb->FEB[p_s_tb->cur_blknum]) + BUG(); + + p_s_tb->FEB[p_s_tb->cur_blknum++] = p_s_new_bh; + } + + if ( n_retval == CARRY_ON && FILESYSTEM_CHANGED_TB (p_s_tb) ) + n_retval = REPEAT_SEARCH ; + + return n_retval; +} + + +/* Get free space of the left neighbor, which is stored in the parent + * node of the left neighbor. */ +static int get_lfree (struct tree_balance * tb, int h) +{ + struct buffer_head * l, * f; + int order; + + if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (l = tb->FL[h]) == 0) + return 0; + + if (f == l) + order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) - 1; + else { + order = B_NR_ITEMS (l); + f = l; + } + + return (MAX_CHILD_SIZE(f) - le16_to_cpu (B_N_CHILD(f,order)->dc_size)); +} + + +/* Get free space of the right neighbor, + * which is stored in the parent node of the right neighbor. + */ +static int get_rfree (struct tree_balance * tb, int h) +{ + struct buffer_head * r, * f; + int order; + + if ((f = PATH_H_PPARENT (tb->tb_path, h)) == 0 || (r = tb->FR[h]) == 0) + return 0; + + if (f == r) + order = PATH_H_B_ITEM_ORDER (tb->tb_path, h) + 1; + else { + order = 0; + f = r; + } + + return (MAX_CHILD_SIZE(f) - B_N_CHILD(f,order)->dc_size); + +} + + +/* Check whether left neighbor is in memory. */ +static int is_left_neighbor_in_cache( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_father, * left; + struct super_block * p_s_sb = p_s_tb->tb_sb; + unsigned long n_left_neighbor_blocknr; + int n_left_neighbor_position; + + if ( ! p_s_tb->FL[n_h] ) /* Father of the left neighbor does not exist. */ + return 0; + + /* Calculate father of the node to be balanced. */ + p_s_father = PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1); + +#ifdef CONFIG_REISERFS_CHECK + if ( ! p_s_father || ! B_IS_IN_TREE (p_s_father) || ! B_IS_IN_TREE (p_s_tb->FL[n_h]) || + ! buffer_uptodate (p_s_father) || ! buffer_uptodate (p_s_tb->FL[n_h]) ) { + reiserfs_panic (p_s_sb, "vs-8165: is_left_neighbor_in_cache: F[h] (%b) or FL[h] (%b) is invalid", + p_s_father, p_s_tb->FL[n_h]); + } +#endif + + + /* Get position of the pointer to the left neighbor into the left father. */ + n_left_neighbor_position = ( p_s_father == p_s_tb->FL[n_h] ) ? + p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); + /* Get left neighbor block number. 
*/ + n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); + /* Look for the left neighbor in the cache. */ + if ( (left = get_hash_table(p_s_sb->s_dev, n_left_neighbor_blocknr, p_s_sb->s_blocksize)) ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( buffer_uptodate (left) && ! B_IS_IN_TREE(left) ) { + reiserfs_panic(p_s_sb, "vs-8170: is_left_neighbor_in_cache: left neighbor (%b %z) is not in the tree", + left, left); + } +#endif + atomic_dec (&(left->b_count)); + return 1; + } + + return 0; +} + + +#define LEFT_PARENTS 'l' +#define RIGHT_PARENTS 'r' + + +static void decrement_key (struct cpu_key * p_s_key) +{ + // call item specific function for this key + item_ops[cpu_key_k_type (p_s_key)]->decrement_key (p_s_key); + + +#if 0 /* this works wrong when key is key of second part of tail: it + sets key to be of indirect type. It looks like it makes no + harm but it is unclear */ + + unsigned long * p_n_key_field = (unsigned long *)p_s_key + REISERFS_FULL_KEY_LEN - 1; + int n_counter; + + for( n_counter = 0; n_counter < REISERFS_FULL_KEY_LEN; n_counter++, p_n_key_field-- ) { + if ( *p_n_key_field ) { + (*p_n_key_field)--; + break; + } + } +#ifdef CONFIG_REISERFS_CHECK + if ( n_counter == REISERFS_FULL_KEY_LEN ) + reiserfs_panic(NULL, "PAP-8175: decrement_key: zero key"); +#endif + +#endif /*0*/ + +} + + + + +/* Calculate far left/right parent of the left/right neighbor of the current node, that + * is calculate the left/right (FL[h]/FR[h]) neighbor of the parent F[h]. + * Calculate left/right common parent of the current node and L[h]/R[h]. + * Calculate left/right delimiting key position. + * Returns: PATH_INCORRECT - path in the tree is not correct; + SCHEDULE_OCCURRED - schedule occured while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_far_parent (struct tree_balance * p_s_tb, + int n_h, + struct buffer_head ** pp_s_father, + struct buffer_head ** pp_s_com_father, + char c_lr_par) +{ + struct buffer_head * p_s_parent; + INITIALIZE_PATH (s_path_to_neighbor_father); + struct path * p_s_path = p_s_tb->tb_path; + struct cpu_key s_lr_father_key; + int n_counter, + n_position = MAX_INT, + n_first_last_position = 0, + n_path_offset = PATH_H_PATH_OFFSET(p_s_path, n_h); + + /* Starting from F[n_h] go upwards in the tree, and look for the common + ancestor of F[n_h], and its neighbor l/r, that should be obtained. */ + + n_counter = n_path_offset; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_counter < FIRST_PATH_ELEMENT_OFFSET ) + reiserfs_panic(p_s_tb->tb_sb, "PAP-8180: get_far_parent: invalid path length"); +#endif + + + for ( ; n_counter > FIRST_PATH_ELEMENT_OFFSET; n_counter-- ) { + /* Check whether parent of the current buffer in the path is really parent in the tree. */ + if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_path, n_counter - 1)) ) + return REPEAT_SEARCH; + /* Check whether position in the parent is correct. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_counter - 1)) > B_NR_ITEMS(p_s_parent) ) + return REPEAT_SEARCH; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_path, n_counter)->b_blocknr ) + return REPEAT_SEARCH; + /* Return delimiting key if position in the parent is not equal to first/last one. 
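+       In that case p_s_parent is the common ancestor of S[n_h] and the
+       needed neighbor, so an extra reference is taken to it and the climb
+       up the path stops here.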
*/ + if ( c_lr_par == RIGHT_PARENTS ) + n_first_last_position = B_NR_ITEMS (p_s_parent); + if ( n_position != n_first_last_position ) { + *pp_s_com_father = p_s_parent; + atomic_inc (&((*pp_s_com_father)->b_count)); + /*(*pp_s_com_father = p_s_parent)->b_count++;*/ + break; + } + } + + /* if we are in the root of the tree, then there is no common father */ + if ( n_counter == FIRST_PATH_ELEMENT_OFFSET ) { + /* Check whether first buffer in the path is the root of the tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_tb->tb_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { + *pp_s_father = *pp_s_com_father = NULL; + return CARRY_ON; + } + return REPEAT_SEARCH; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( B_LEVEL (*pp_s_com_father) <= DISK_LEAF_NODE_LEVEL ) { + reiserfs_panic(p_s_tb->tb_sb, "PAP-8185: get_far_parent: (%b %z) level too small", *pp_s_com_father, *pp_s_com_father); + } +#endif + + /* Check whether the common parent is locked. */ + + if ( buffer_locked (*pp_s_com_father) ) { + __wait_on_buffer(*pp_s_com_father); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } + } + + /* So, we got common parent of the current node and its left/right neighbor. + Now we are geting the parent of the left/right neighbor. */ + + /* Form key to get parent of the left/right neighbor. */ + le_key2cpu_key (&s_lr_father_key, B_N_PDELIM_KEY(*pp_s_com_father, ( c_lr_par == LEFT_PARENTS ) ? + (p_s_tb->lkey[n_h - 1] = n_position - 1) : (p_s_tb->rkey[n_h - 1] = n_position))); + + + if ( c_lr_par == LEFT_PARENTS ) + decrement_key(&s_lr_father_key); + + if (search_by_key(p_s_tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father, n_h + 1) == IO_ERROR) + // path is released + return IO_ERROR; + + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_counters_in_path(&s_path_to_neighbor_father); + decrement_bcount(*pp_s_com_father); + return REPEAT_SEARCH; + } + + *pp_s_father = PATH_PLAST_BUFFER(&s_path_to_neighbor_father); + +#ifdef CONFIG_REISERFS_CHECK + if ( B_LEVEL (*pp_s_father) != n_h + 1 ) { + reiserfs_panic(p_s_tb->tb_sb, "PAP-8190: get_far_parent: (%b %z) level too small", *pp_s_father, *pp_s_father); + } + + if ( s_path_to_neighbor_father.path_length < FIRST_PATH_ELEMENT_OFFSET ) + reiserfs_panic(0, "PAP-8192: get_far_parent: path length is too small"); + +#endif + + s_path_to_neighbor_father.path_length--; + decrement_counters_in_path(&s_path_to_neighbor_father); + return CARRY_ON; +} + + +/* Get parents of neighbors of node in the path(S[n_path_offset]) and common parents of + * S[n_path_offset] and L[n_path_offset]/R[n_path_offset]: F[n_path_offset], FL[n_path_offset], + * FR[n_path_offset], CFL[n_path_offset], CFR[n_path_offset]. + * Calculate numbers of left and right delimiting keys position: lkey[n_path_offset], rkey[n_path_offset]. + * Returns: SCHEDULE_OCCURRED - schedule occured while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_parents (struct tree_balance * p_s_tb, int n_h) +{ + struct path * p_s_path = p_s_tb->tb_path; + int n_position, + n_ret_value, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + struct buffer_head * p_s_curf, + * p_s_curcf; + + /* Current node is the root of the tree or will be root of the tree */ + if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { + /* The root can not have parents. + Release nodes which previously were obtained as parents of the current node neighbors. 
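+       All four slots (FL, CFL, FR and CFR) are dropped and cleared below,
+       since a root has neither neighbors nor a father.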
*/ + decrement_bcount(p_s_tb->FL[n_h]); + decrement_bcount(p_s_tb->CFL[n_h]); + decrement_bcount(p_s_tb->FR[n_h]); + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->FL[n_h] = p_s_tb->CFL[n_h] = p_s_tb->FR[n_h] = p_s_tb->CFR[n_h] = NULL; + return CARRY_ON; + } + + /* Get parent FL[n_path_offset] of L[n_path_offset]. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) ) { + /* Current node is not the first child of its parent. */ + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ + p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + atomic_inc (&(p_s_curf->b_count)); + atomic_inc (&(p_s_curf->b_count)); + p_s_tb->lkey[n_h] = n_position - 1; + } + else { + /* Calculate current parent of L[n_path_offset], which is the left neighbor of the current node. + Calculate current common parent of L[n_path_offset] and the current node. Note that + CFL[n_path_offset] not equal FL[n_path_offset] and CFL[n_path_offset] not equal F[n_path_offset]. + Calculate lkey[n_path_offset]. */ + if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, + &p_s_curcf, LEFT_PARENTS)) != CARRY_ON ) + return n_ret_value; + } + + decrement_bcount(p_s_tb->FL[n_h]); + p_s_tb->FL[n_h] = p_s_curf; /* New initialization of FL[n_h]. */ + decrement_bcount(p_s_tb->CFL[n_h]); + p_s_tb->CFL[n_h] = p_s_curcf; /* New initialization of CFL[n_h]. */ + +#ifdef CONFIG_REISERFS_CHECK + if ((p_s_curf && !B_IS_IN_TREE (p_s_curf)) || (p_s_curcf && !B_IS_IN_TREE (p_s_curcf))) { + reiserfs_panic (p_s_tb->tb_sb, "PAP-8195: get_parents: FL (%b) or CFL (%b) is invalid", p_s_curf, p_s_curcf); + } +#endif + +/* Get parent FR[n_h] of R[n_h]. */ + +/* Current node is the last child of F[n_h]. FR[n_h] != F[n_h]. */ + if ( n_position == B_NR_ITEMS (PATH_H_PBUFFER(p_s_path, n_h + 1)) ) { +/* Calculate current parent of R[n_h], which is the right neighbor of F[n_h]. + Calculate current common parent of R[n_h] and current node. Note that CFR[n_h] + not equal FR[n_path_offset] and CFR[n_h] not equal F[n_h]. */ + if ( (n_ret_value = get_far_parent(p_s_tb, n_h + 1, &p_s_curf, &p_s_curcf, RIGHT_PARENTS)) != CARRY_ON ) + return n_ret_value; + } + else { +/* Current node is not the last child of its parent F[n_h]. */ + /*(p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1))->b_count += 2;*/ + p_s_curf = p_s_curcf = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1); + atomic_inc (&(p_s_curf->b_count)); + atomic_inc (&(p_s_curf->b_count)); + p_s_tb->rkey[n_h] = n_position; + } + + decrement_bcount(p_s_tb->FR[n_h]); + p_s_tb->FR[n_h] = p_s_curf; /* New initialization of FR[n_path_offset]. */ + + decrement_bcount(p_s_tb->CFR[n_h]); + p_s_tb->CFR[n_h] = p_s_curcf; /* New initialization of CFR[n_path_offset]. 
*/
+
+#ifdef CONFIG_REISERFS_CHECK
+#if 0
+    if (n_h == 0 && p_s_tb->CFR[n_h] && COMP_KEYS (B_PRIGHT_DELIM_KEY (PATH_H_PBUFFER(p_s_path, n_h)),
+						    B_N_PDELIM_KEY (p_s_tb->CFR[n_h], p_s_tb->rkey[n_h]))) {
+	reiserfs_panic (p_s_tb->tb_sb, "PAP-8200: get_parents: rdkey in S0 %k, rdkey in CFR0 %k do not match",
+			B_PRIGHT_DELIM_KEY (PATH_H_PBUFFER(p_s_path, n_h)), B_N_PDELIM_KEY (p_s_tb->CFR[n_h], p_s_tb->rkey[n_h]));
+    }
+#endif
+    if ((p_s_curf && !B_IS_IN_TREE (p_s_curf)) || (p_s_curcf && !B_IS_IN_TREE (p_s_curcf))) {
+	reiserfs_panic (p_s_tb->tb_sb, "PAP-8205: get_parents: FR (%b) or CFR (%b) is invalid", p_s_curf, p_s_curcf);
+    }
+#endif
+
+    return CARRY_ON;
+}
+
+
+/* it is possible to remove a node as a result of shifting to its
+   neighbors even when we insert or paste an item. */
+static inline int can_node_be_removed (int mode, int lfree, int sfree, int rfree, struct tree_balance * tb, int h)
+{
+    struct buffer_head * Sh = PATH_H_PBUFFER (tb->tb_path, h);
+    int levbytes = tb->insert_size[h];
+    struct item_head * ih;
+    struct key * r_key = NULL;
+
+    ih = B_N_PITEM_HEAD (Sh, 0);
+    if ( tb->CFR[h] )
+	r_key = B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]);
+
+    if (
+	lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
+	/* shifting may merge items which might save space */
+#ifdef REISERFS_FSCK
+	- (( ! h && is_left_mergeable (tb->tb_sb, tb->tb_path) == 1 ) ? IH_SIZE : 0)
+	- (( ! h && r_ih && is_right_mergeable (tb->tb_sb, tb->tb_path) == 1 ) ? IH_SIZE : 0)
+#else
+	- (( ! h && op_is_left_mergeable (&(ih->ih_key), Sh->b_size) ) ? IH_SIZE : 0)
+	- (( ! h && r_key && op_is_left_mergeable (r_key, Sh->b_size) ) ? IH_SIZE : 0)
+#endif
+	+ (( h ) ? KEY_SIZE : 0))
+    {
+	/* node cannot be removed */
+	if (sfree >= levbytes ) { /* new item fits into node S[h] without any shifting */
+	    if ( ! h )
+		tb->s0num = B_NR_ITEMS(Sh) + ((mode == M_INSERT ) ? 1 : 0);
+	    set_parameters (tb, h, 0, 0, 1, NULL, -1, -1);
+	    return NO_BALANCING_NEEDED;
+	}
+    }
+    return !NO_BALANCING_NEEDED;
+}
+
+
+
+/* Check whether current node S[h] is balanced when increasing its size by
+ * Inserting or Pasting.
+ * Calculate parameters for balancing for current level h.
+ * Parameters:
+ *	tb	tree_balance structure;
+ *	h	current level of the node;
+ *	inum	item number in S[h];
+ *	mode	i - insert, p - paste;
+ * Returns:	 1 - schedule occurred;
+ *	         0 - balancing for higher levels needed;
+ *	        -1 - no balancing for higher levels needed;
+ *	        -2 - no disk space.
+ */
+/* ip means Inserting or Pasting */
+static int ip_check_balance (struct tree_balance * tb, int h)
+{
+    struct virtual_node * vn = tb->tb_vn;
+    int levbytes,  /* Number of bytes that must be inserted into (value
+		      is negative if bytes are deleted) buffer which
+		      contains node being balanced.  The mnemonic is
+		      that the attempted change in node space used level
+		      is levbytes bytes. */
+	n_ret_value;
+
+    int lfree, sfree, rfree /* free space in L, S and R */;
+
+    /* nver is short for number of vertices, and lnver is the number if
+       we shift to the left, rnver is the number if we shift to the
+       right, and lrnver is the number if we shift in both directions.
+       The goal is to minimize first the number of vertices, and second,
+       the number of vertices whose contents are changed by shifting,
+       and third the number of uncached vertices whose contents are
+       changed by shifting and must be read from disk.
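+       (Illustration with made-up counts: if shifting only left or only
+       right would each leave 2 nodes but shifting in both directions leaves
+       1, the two-way shift wins on the first criterion; when shifting does
+       not reduce the number of nodes at all the code below does not shift,
+       and a left/right tie is broken in favor of the neighbor that is
+       already in cache.)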
*/ + int nver, lnver, rnver, lrnver; + + /* used at leaf level only, S0 = S[0] is the node being balanced, + sInum [ I = 0,1,2 ] is the number of items that will + remain in node SI after balancing. S1 and S2 are new + nodes that might be created. */ + + /* we perform 8 calls to get_num_ver(). For each call we calculate five parameters. + where 4th parameter is s1bytes and 5th - s2bytes + */ + short snum012[40] = {0,}; /* s0num, s1num, s2num for 8 cases + 0,1 - do not shift and do not shift but bottle + 2 - shift only whole item to left + 3 - shift to left and bottle as much as possible + 4,5 - shift to right (whole items and as much as possible + 6,7 - shift to both directions (whole items and as much as possible) + */ + + /* Sh is the node whose balance is currently being checked */ + struct buffer_head * Sh; + +#ifdef REISERFS_FSCK + /* special mode for insert pointer to the most low internal node */ + if (h == 0 && vn->vn_mode == M_INTERNAL) { + /* blk_num == 2 is to get pointer inserted to the next level */ + set_parameters (tb, h, 0, 0, 2, NULL, -1, -1); + return 0; + } +#endif + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + levbytes = tb->insert_size[h]; + + /* Calculate balance parameters for creating new root. */ + if ( ! Sh ) { + if ( ! h ) + reiserfs_panic (tb->tb_sb, "vs-8210: ip_check_balance: S[0] can not be 0"); + switch ( n_ret_value = get_empty_nodes (tb, h) ) { + case CARRY_ON: + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + + case NO_DISK_SPACE: + case REPEAT_SEARCH: + return n_ret_value; + default: + reiserfs_panic(tb->tb_sb, "vs-8215: ip_check_balance: incorrect return value of get_empty_nodes"); + } + } + + if ( (n_ret_value = get_parents (tb, h)) != CARRY_ON ) /* get parents of S[h] neighbors. */ + return n_ret_value; + + sfree = B_FREE_SPACE (Sh); + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + if (can_node_be_removed (vn->vn_mode, lfree, sfree, rfree, tb, h) == NO_BALANCING_NEEDED) + /* and new item fits into node S[h] without any shifting */ + return NO_BALANCING_NEEDED; + + create_virtual_node (tb, h); + + /* + determine maximal number of items we can shift to the left neighbor (in tb structure) + and the maximal number of bytes that can flow to the left neighbor + from the left most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_left (tb, h, lfree); + + /* + determine maximal number of items we can shift to the right neighbor (in tb structure) + and the maximal number of bytes that can flow to the right neighbor + from the right most liquid item that cannot be shifted from S[0] entirely (returned value) + */ + check_right (tb, h, rfree); + + + /* all contents of internal node S[h] can be moved into its + neighbors, S[h] will be removed after balancing */ + if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) { + int to_r; + + /* Since we are working on internal nodes, and our internal + nodes have fixed size entries, then we can balance by the + number of items rather than the space they consume. In this + routine we set the left node equal to the right node, + allowing a difference of less than or equal to 1 child + pointer. 
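+	   Worked example (made-up numbers): with MAX_NR_KEY(Sh) == 5,
+	   tb->lnum[h] == tb->rnum[h] == 4 and vn->vn_nr_item == 6 the formula
+	   below gives to_r = (10 + 2 - 4 - 4 + 6 + 1)/2 - (5 + 1 - 4) = 3,
+	   so 3 children go to R[h] and vn->vn_nr_item + 1 - to_r = 4 go to
+	   L[h], leaving the two neighbors within one child pointer of each
+	   other.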
*/ + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + +#ifdef CONFIG_REISERFS_CHECK + /* this checks balance condition, that any two neighboring nodes can not fit in one node */ + if ( h && ( tb->lnum[h] >= vn->vn_nr_item + 1 || tb->rnum[h] >= vn->vn_nr_item + 1) ) + reiserfs_panic (tb->tb_sb, "vs-8220: ip_check_balance: tree is not balanced on internal level"); + + if ( ! h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) || + (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1)) )) + reiserfs_panic(tb->tb_sb, "vs-8225: ip_check_balance: tree is not balanced on leaf level"); +#endif + + /* all contents of S[0] can be moved into its neighbors + S[0] will be removed after balancing. */ + if (!h && is_leaf_removable (tb)) + return CARRY_ON; + + + /* why do we perform this check here rather than earlier?? + Answer: we can win 1 node in some cases above. Moreover we + checked it above, when we checked, that S[0] is not removable + in principle */ + if (sfree >= levbytes) { /* new item fits into node S[h] without any shifting */ + if ( ! h ) + tb->s0num = vn->vn_nr_item; + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + + { + int lpar, rpar, nset, lset, rset, lrset; + /* + * regular overflowing of the node + */ + + /* get_num_ver works in 2 modes (FLOW & NO_FLOW) + lpar, rpar - number of items we can shift to left/right neighbor (including splitting item) + nset, lset, rset, lrset - shows, whether flowing items give better packing + */ +#define FLOW 1 +#define NO_FLOW 0 /* do not any splitting */ + + /* we choose one the following */ +#define NOTHING_SHIFT_NO_FLOW 0 +#define NOTHING_SHIFT_FLOW 5 +#define LEFT_SHIFT_NO_FLOW 10 +#define LEFT_SHIFT_FLOW 15 +#define RIGHT_SHIFT_NO_FLOW 20 +#define RIGHT_SHIFT_FLOW 25 +#define LR_SHIFT_NO_FLOW 30 +#define LR_SHIFT_FLOW 35 + + + lpar = tb->lnum[h]; + rpar = tb->rnum[h]; + + + /* calculate number of blocks S[h] must be split into when + nothing is shifted to the neighbors, + as well as number of items in each part of the split node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any */ + nset = NOTHING_SHIFT_NO_FLOW; + nver = get_num_ver (vn->vn_mode, tb, h, + 0, -1, h?vn->vn_nr_item:0, -1, + snum012, NO_FLOW); + + if (!h) + { + int nver1; + + /* note, that in this case we try to bottle between S[0] and S1 (S1 - the first new node) */ + nver1 = get_num_ver (vn->vn_mode, tb, h, + 0, -1, 0, -1, + snum012 + NOTHING_SHIFT_FLOW, FLOW); + if (nver > nver1) + nset = NOTHING_SHIFT_FLOW, nver = nver1; + } + + + /* calculate number of blocks S[h] must be split into when + l_shift_num first items and l_shift_bytes of the right most + liquid item to be shifted are shifted to the left neighbor, + as well as number of items in each part of the splitted node (s012 numbers), + and number of bytes (s1bytes) of the shared drop which flow to S1 if any + */ + lset = LEFT_SHIFT_NO_FLOW; + lnver = get_num_ver (vn->vn_mode, tb, h, + lpar - (( h || tb->lbytes == -1 ) ? 0 : 1), -1, h ? vn->vn_nr_item:0, -1, + snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW); + if (!h) + { + int lnver1; + + lnver1 = get_num_ver (vn->vn_mode, tb, h, + lpar - ((tb->lbytes != -1) ? 
1 : 0), tb->lbytes, 0, -1,
+			      snum012 + LEFT_SHIFT_FLOW, FLOW);
+	if (lnver > lnver1)
+	    lset = LEFT_SHIFT_FLOW, lnver = lnver1;
+    }
+
+
+    /* calculate number of blocks S[h] must be split into when
+       r_shift_num first items and r_shift_bytes of the left most
+       liquid item to be shifted are shifted to the right neighbor,
+       as well as number of items in each part of the split node (s012 numbers),
+       and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+    */
+    rset = RIGHT_SHIFT_NO_FLOW;
+    rnver = get_num_ver (vn->vn_mode, tb, h,
+			 0, -1, h ? (vn->vn_nr_item-rpar) : (rpar - (( tb->rbytes != -1 ) ? 1 : 0)), -1,
+			 snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
+    if (!h)
+    {
+	int rnver1;
+
+	rnver1 = get_num_ver (vn->vn_mode, tb, h,
+			      0, -1, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
+			      snum012 + RIGHT_SHIFT_FLOW, FLOW);
+
+	if (rnver > rnver1)
+	    rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
+    }
+
+
+    /* calculate number of blocks S[h] must be split into when
+       items are shifted in both directions,
+       as well as number of items in each part of the split node (s012 numbers),
+       and number of bytes (s1bytes) of the shared drop which flow to S1 if any
+    */
+    lrset = LR_SHIFT_NO_FLOW;
+    lrnver = get_num_ver (vn->vn_mode, tb, h,
+			  lpar - ((h || tb->lbytes == -1) ? 0 : 1), -1, h ? (vn->vn_nr_item-rpar):(rpar - ((tb->rbytes != -1) ? 1 : 0)), -1,
+			  snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
+    if (!h)
+    {
+	int lrnver1;
+
+	lrnver1 = get_num_ver (vn->vn_mode, tb, h,
+			       lpar - ((tb->lbytes != -1) ? 1 : 0), tb->lbytes, (rpar - ((tb->rbytes != -1) ? 1 : 0)), tb->rbytes,
+			       snum012 + LR_SHIFT_FLOW, FLOW);
+	if (lrnver > lrnver1)
+	    lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
+    }
+
+
+
+    /* Our general shifting strategy is:
+       1) to minimize the number of new nodes;
+       2) to minimize the number of neighbors involved in shifting;
+       3) to minimize the number of disk reads; */
+
+    /* we can win TWO or ONE nodes by shifting in both directions */
+    if (lrnver < lnver && lrnver < rnver)
+    {
+#ifdef CONFIG_REISERFS_CHECK
+	if (h && (tb->lnum[h] != 1 || tb->rnum[h] != 1 || lrnver != 1 || rnver != 2 || lnver != 2 || h != 1))
+	    reiserfs_panic (0, "vs-8230: check_balance: bad h");
+#endif
+	if (lrset == LR_SHIFT_FLOW)
+	    set_parameters (tb, h, tb->lnum[h], tb->rnum[h], lrnver, snum012 + lrset,
+			    tb->lbytes, tb->rbytes);
+	else
+	    set_parameters (tb, h, tb->lnum[h] - ((tb->lbytes == -1) ? 0 : 1),
+			    tb->rnum[h] - ((tb->rbytes == -1) ?
0 : 1), lrnver, snum012 + lrset, -1, -1); + + return CARRY_ON; + } + + /* if shifting doesn't lead to better packing then don't shift */ + if (nver == lrnver) + { + set_parameters (tb, h, 0, 0, nver, snum012 + nset, -1, -1); + return CARRY_ON; + } + + + /* now we know that for better packing shifting in only one + direction either to the left or to the right is required */ + + /* if shifting to the left is better than shifting to the right */ + if (lnver < rnver) + { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* if shifting to the right is better than shifting to the left */ + if (lnver > rnver) + { + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } + + + /* now shifting in either direction gives the same number + of nodes and we can make use of the cached neighbors */ + if (is_left_neighbor_in_cache (tb,h)) + { + SET_PAR_SHIFT_LEFT; + return CARRY_ON; + } + + /* shift to the right independently on whether the right neighbor in cache or not */ + SET_PAR_SHIFT_RIGHT; + return CARRY_ON; + } +} + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting for INTERNAL node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occured; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + * + * Note: Items of internal nodes have fixed size, so the balance condition for + * the internal part of S+tree is as for the B-trees. + */ +static int dc_check_balance_internal (struct tree_balance * tb, int h) +{ + struct virtual_node * vn = tb->tb_vn; + + /* Sh is the node whose balance is currently being checked, + and Fh is its father. */ + struct buffer_head * Sh, * Fh; + int maxsize, + n_ret_value; + int lfree, rfree /* free space in L and R */; + + Sh = PATH_H_PBUFFER (tb->tb_path, h); + Fh = PATH_H_PPARENT (tb->tb_path, h); + + maxsize = MAX_CHILD_SIZE(Sh); + +/* using tb->insert_size[h], which is negative in this case, create_virtual_node calculates: */ +/* new_nr_item = number of items node would have if operation is */ +/* performed without balancing (new_nr_item); */ + create_virtual_node (tb, h); + + if ( ! Fh ) + { /* S[h] is the root. */ + if ( vn->vn_nr_item > 0 ) + { + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; /* no balancing for higher levels needed */ + } + /* new_nr_item == 0. + * Current root will be deleted resulting in + * decrementing the tree height. */ + set_parameters (tb, h, 0, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) + return n_ret_value; + + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + /* determine maximal number of items we can fit into neighbors */ + check_left (tb, h, lfree); + check_right (tb, h, rfree); + + + if ( vn->vn_nr_item >= MIN_NR_KEY(Sh) ) + { /* Balance condition for the internal node is valid. + * In this case we balance only if it leads to better packing. */ + if ( vn->vn_nr_item == MIN_NR_KEY(Sh) ) + { /* Here we join S[h] with one of its neighbors, + * which is impossible with greater values of new_nr_item. */ + if ( tb->lnum[h] >= vn->vn_nr_item + 1 ) + { + /* All contents of S[h] can be moved to L[h]. */ + int n; + int order_L; + + order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? 
B_NR_ITEMS(tb->FL[h]) : n - 1; + n = B_N_CHILD(tb->FL[h],order_L)->dc_size / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + if ( tb->rnum[h] >= vn->vn_nr_item + 1 ) + { + /* All contents of S[h] can be moved to R[h]. */ + int n; + int order_R; + + order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : n + 1; + n = B_N_CHILD(tb->FR[h],order_R)->dc_size / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); + return CARRY_ON; + } + } + + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) + { + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + int to_r; + + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Balancing does not lead to better packing. */ + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + /* Current node contain insufficient number of items. Balancing is required. */ + /* Check whether we can merge S[h] with left neighbor. */ + if (tb->lnum[h] >= vn->vn_nr_item + 1) + if (is_left_neighbor_in_cache (tb,h) || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) + { + int n; + int order_L; + + order_L = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==0) ? B_NR_ITEMS(tb->FL[h]) : n - 1; + n = B_N_CHILD(tb->FL[h],order_L)->dc_size / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, -n-1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* Check whether we can merge S[h] with right neighbor. */ + if (tb->rnum[h] >= vn->vn_nr_item + 1) + { + int n; + int order_R; + + order_R = ((n=PATH_H_B_ITEM_ORDER(tb->tb_path, h))==B_NR_ITEMS(Fh)) ? 0 : (n + 1); + n = B_N_CHILD(tb->FR[h],order_R)->dc_size / (DC_SIZE + KEY_SIZE); + set_parameters (tb, h, 0, -n-1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */ + if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) + { + int to_r; + + to_r = ((MAX_NR_KEY(Sh)<<1)+2-tb->lnum[h]-tb->rnum[h]+vn->vn_nr_item+1)/2 - + (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]); + set_parameters (tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* For internal nodes try to borrow item from a neighbor */ +#ifdef CONFIG_REISERFS_CHECK + if (!tb->FL[h] && !tb->FR[h]) + reiserfs_panic (0, "vs-8235: dc_check_balance_internal: trying to borrow for root"); +#endif + + /* Borrow one or two items from caching neighbor */ + if (is_left_neighbor_in_cache (tb,h) || !tb->FR[h]) + { + int from_l; + + from_l = (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item + 1) / 2 - (vn->vn_nr_item + 1); + set_parameters (tb, h, -from_l, 0, 1, NULL, -1, -1); + return CARRY_ON; + } + + set_parameters (tb, h, 0, -((MAX_NR_KEY(Sh)+1-tb->rnum[h]+vn->vn_nr_item+1)/2-(vn->vn_nr_item+1)), 1, + NULL, -1, -1); + return CARRY_ON; +} + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Truncating for LEAF node of S+tree. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste; + * Returns: 1 - schedule occured; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. 
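+ *
+ * The checks below either find that S[0] (after the delete or cut) can be
+ * emptied into its neighbors, that it can be merged entirely with L[0] or
+ * with R[0], or that no balancing is needed at all.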
+ */ +static int dc_check_balance_leaf (struct tree_balance * tb, int h) +{ + struct virtual_node * vn = tb->tb_vn; + + /* Number of bytes that must be deleted from + (value is negative if bytes are deleted) buffer which + contains node being balanced. The mnemonic is that the + attempted change in node space used level is levbytes bytes. */ + int levbytes; + /* the maximal item size */ + int maxsize, + n_ret_value; + /* S0 is the node whose balance is currently being checked, + and F0 is its father. */ + struct buffer_head * S0, * F0; + int lfree, rfree /* free space in L and R */; + + S0 = PATH_H_PBUFFER (tb->tb_path, 0); + F0 = PATH_H_PPARENT (tb->tb_path, 0); + + levbytes = tb->insert_size[h]; + + maxsize = MAX_CHILD_SIZE(S0); /* maximal possible size of an item */ + + if ( ! F0 ) + { /* S[0] is the root now. */ + +#ifdef CONFIG_REISERFS_CHECK + if ( -levbytes >= maxsize - B_FREE_SPACE (S0) ) + reiserfs_panic (tb->tb_sb, "vs-8240: dc_check_balance_leaf: attempt to create empty buffer tree"); +#endif + + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; + } + + if ( (n_ret_value = get_parents(tb,h)) != CARRY_ON ) + return n_ret_value; + + /* get free space of neighbors */ + rfree = get_rfree (tb, h); + lfree = get_lfree (tb, h); + + create_virtual_node (tb, h); + + /* if 3 leaves can be merge to one, set parameters and return */ + if (are_leaves_removable (tb, lfree, rfree)) + return CARRY_ON; + + /* determine maximal number of items we can shift to the left/right neighbor + and the maximal number of bytes that can flow to the left/right neighbor + from the left/right most liquid item that cannot be shifted from S[0] entirely + */ + check_left (tb, h, lfree); + check_right (tb, h, rfree); + + /* check whether we can merge S with left neighbor. */ + if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1) + if (is_left_neighbor_in_cache (tb,h) || + ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) || /* S can not be merged with R */ + !tb->FR[h]) { + +#ifdef CONFIG_REISERFS_CHECK + if (!tb->FL[h]) + reiserfs_panic (0, "vs-8245: dc_check_balance_leaf: FL[h] must exist"); +#endif + + /* set parameter to merge S[0] with its left neighbor */ + set_parameters (tb, h, -1, 0, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* check whether we can merge S[0] with right neighbor. */ + if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) { + set_parameters (tb, h, 0, -1, 0, NULL, -1, -1); + return CARRY_ON; + } + + /* All contents of S[0] can be moved to the neighbors (L[0] & R[0]). Set parameters and return */ + if (is_leaf_removable (tb)) + return CARRY_ON; + + /* Balancing is not required. */ + tb->s0num = vn->vn_nr_item; + set_parameters (tb, h, 0, 0, 1, NULL, -1, -1); + return NO_BALANCING_NEEDED; +} + + + +/* Check whether current node S[h] is balanced when Decreasing its size by + * Deleting or Cutting. + * Calculate parameters for balancing for current level h. + * Parameters: + * tb tree_balance structure; + * h current level of the node; + * inum item number in S[h]; + * mode d - delete, c - cut. + * Returns: 1 - schedule occured; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int dc_check_balance (struct tree_balance * tb, int h) +{ + +#ifdef CONFIG_REISERFS_CHECK + if ( ! 
(PATH_H_PBUFFER (tb->tb_path, h)) ) + reiserfs_panic(tb->tb_sb, "vs-8250: dc_check_balance: S is not initialized"); +#endif + + if ( h ) + return dc_check_balance_internal (tb, h); + else + return dc_check_balance_leaf (tb, h); +} + + + +/* Check whether current node S[h] is balanced. + * Calculate parameters for balancing for current level h. + * Parameters: + * + * tb tree_balance structure: + * + * tb is a large structure that must be read about in the header file + * at the same time as this procedure if the reader is to successfully + * understand this procedure + * + * h current level of the node; + * inum item number in S[h]; + * mode i - insert, p - paste, d - delete, c - cut. + * Returns: 1 - schedule occured; + * 0 - balancing for higher levels needed; + * -1 - no balancing for higher levels needed; + * -2 - no disk space. + */ +static int check_balance (int mode, + struct tree_balance * tb, + int h, + int inum, + int pos_in_item, + struct item_head * ins_ih, + const void * data + ) +{ + struct virtual_node * vn; + + vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf); + vn->vn_free_ptr = (char *)(tb->tb_vn + 1); + vn->vn_mode = mode; + vn->vn_affected_item_num = inum; + vn->vn_pos_in_item = pos_in_item; + vn->vn_ins_ih = ins_ih; + vn->vn_data = data; + +#ifdef CONFIG_REISERFS_CHECK + if (mode == M_INSERT && !vn->vn_ins_ih) + reiserfs_panic (0, "vs-8255: check_balance: ins_ih can not be 0 in insert mode"); +#endif + + if ( tb->insert_size[h] > 0 ) + /* Calculate balance parameters when size of node is increasing. */ + return ip_check_balance (tb, h); + + /* Calculate balance parameters when size of node is decreasing. */ + return dc_check_balance (tb, h); +} + + + +/* Check whether parent at the path is the really parent of the current node.*/ +static int get_direct_parent( + struct tree_balance * p_s_tb, + int n_h + ) { + struct buffer_head * p_s_bh; + struct path * p_s_path = p_s_tb->tb_path; + int n_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h); + + /* We are in the root or in the new root. */ + if ( n_path_offset <= FIRST_PATH_ELEMENT_OFFSET ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < FIRST_PATH_ELEMENT_OFFSET - 1 ) + reiserfs_panic(p_s_tb->tb_sb, "PAP-8260: get_direct_parent: illegal offset in the path"); +#endif + + if ( PATH_OFFSET_PBUFFER(p_s_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_tb->tb_sb) ) { + /* Root is not changed. */ + PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1) = NULL; + PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1) = 0; + return CARRY_ON; + } + return REPEAT_SEARCH; /* Root is changed and we must recalculate the path. */ + } + + if ( ! B_IS_IN_TREE(p_s_bh = PATH_OFFSET_PBUFFER(p_s_path, n_path_offset - 1)) ) + return REPEAT_SEARCH; /* Parent in the path is not in the tree. */ + + if ( (n_position = PATH_OFFSET_POSITION(p_s_path, n_path_offset - 1)) > B_NR_ITEMS(p_s_bh) ) + return REPEAT_SEARCH; + + if ( B_N_CHILD_NUM(p_s_bh, n_position) != PATH_OFFSET_PBUFFER(p_s_path, n_path_offset)->b_blocknr ) + /* Parent in the path is not parent of the current node in the tree. */ + return REPEAT_SEARCH; + + if ( buffer_locked(p_s_bh) ) { + __wait_on_buffer(p_s_bh); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) + return REPEAT_SEARCH; + } + + return CARRY_ON; /* Parent in the path is unlocked and really parent of the current node. */ +} + + +/* Using lnum[n_h] and rnum[n_h] we should determine what neighbors + * of S[n_h] we + * need in order to balance S[n_h], and get them if necessary. 
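+ * The neighbors are read with reiserfs_bread() and attached to tb->L[n_h]
+ * and tb->R[n_h], replacing whatever was cached there before; if the file
+ * system changed while a neighbor was being read, the search is repeated.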
+ * Returns: SCHEDULE_OCCURRED - schedule occured while the function worked; + * CARRY_ON - schedule didn't occur while the function worked; + */ +static int get_neighbors( + struct tree_balance * p_s_tb, + int n_h + ) { + int n_child_position, + n_path_offset = PATH_H_PATH_OFFSET(p_s_tb->tb_path, n_h + 1); + unsigned long n_son_number; + struct super_block * p_s_sb = p_s_tb->tb_sb; + struct buffer_head * p_s_bh; + + + if ( p_s_tb->lnum[n_h] ) { + /* We need left neighbor to balance S[n_h]. */ + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + +#ifdef CONFIG_REISERFS_CHECK + if ( p_s_bh == p_s_tb->FL[n_h] && ! PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) ) + reiserfs_panic (p_s_tb->tb_sb, "PAP-8270: get_neighbors: invalid position in the parent"); +#endif + + n_child_position = ( p_s_bh == p_s_tb->FL[n_h] ) ? p_s_tb->lkey[n_h] : B_NR_ITEMS (p_s_tb->FL[n_h]); + n_son_number = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position); + p_s_bh = reiserfs_bread(p_s_sb->s_dev, n_son_number, p_s_sb->s_blocksize); + if (!p_s_bh) + return IO_ERROR; + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(p_s_bh); + return REPEAT_SEARCH; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( ! B_IS_IN_TREE(p_s_tb->FL[n_h]) || n_child_position > B_NR_ITEMS(p_s_tb->FL[n_h]) || + B_N_CHILD_NUM(p_s_tb->FL[n_h], n_child_position) != p_s_bh->b_blocknr ) + reiserfs_panic (p_s_tb->tb_sb, "PAP-8275: get_neighbors: invalid parent"); + if ( ! B_IS_IN_TREE(p_s_bh) ) + reiserfs_panic (p_s_tb->tb_sb, "PAP-8280: get_neighbors: invalid child"); + + if (! n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - B_N_CHILD (p_s_tb->FL[0],n_child_position)->dc_size) + reiserfs_panic (p_s_tb->tb_sb, "PAP-8290: get_neighbors: invalid child size of left neighbor"); +#endif + + decrement_bcount(p_s_tb->L[n_h]); + p_s_tb->L[n_h] = p_s_bh; + } + + + if ( p_s_tb->rnum[n_h] ) { /* We need right neighbor to balance S[n_path_offset]. */ + p_s_bh = PATH_OFFSET_PBUFFER(p_s_tb->tb_path, n_path_offset); + +#ifdef CONFIG_REISERFS_CHECK + if ( p_s_bh == p_s_tb->FR[n_h] && PATH_OFFSET_POSITION(p_s_tb->tb_path, n_path_offset) >= B_NR_ITEMS(p_s_bh) ) + reiserfs_panic (p_s_tb->tb_sb, "PAP-8295: get_neighbors: invalid position in the parent"); +#endif + + n_child_position = ( p_s_bh == p_s_tb->FR[n_h] ) ? p_s_tb->rkey[n_h] + 1 : 0; + n_son_number = B_N_CHILD_NUM(p_s_tb->FR[n_h], n_child_position); + p_s_bh = reiserfs_bread(p_s_sb->s_dev, n_son_number, p_s_sb->s_blocksize); + if (!p_s_bh) + return IO_ERROR; + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + decrement_bcount(p_s_bh); + return REPEAT_SEARCH; + } + decrement_bcount(p_s_tb->R[n_h]); + p_s_tb->R[n_h] = p_s_bh; + +#ifdef CONFIG_REISERFS_CHECK + if (! 
n_h && B_FREE_SPACE (p_s_bh) != MAX_CHILD_SIZE (p_s_bh) - B_N_CHILD (p_s_tb->FR[0],n_child_position)->dc_size) { + reiserfs_panic (p_s_tb->tb_sb, "PAP-8300: get_neighbors: invalid child size of right neighbor (%d != %d - %d)", + B_FREE_SPACE (p_s_bh), MAX_CHILD_SIZE (p_s_bh), B_N_CHILD (p_s_tb->FR[0],n_child_position)->dc_size); + } +#endif + + } + return CARRY_ON; +} + + +void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s) +{ + void * vp; + static size_t malloced; + + + vp = kmalloc (size, flags); + if (vp) { + s->u.reiserfs_sb.s_kmallocs += size; + if (s->u.reiserfs_sb.s_kmallocs > malloced + 200000) { + reiserfs_warning ("vs-8301: reiserfs_kmalloc: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs); + malloced = s->u.reiserfs_sb.s_kmallocs; + } + } +/*printk ("malloc : size %d, allocated %d\n", size, s->u.reiserfs_sb.s_kmallocs);*/ + return vp; +} + +void reiserfs_kfree (const void * vp, size_t size, struct super_block * s) +{ + kfree (vp); + + s->u.reiserfs_sb.s_kmallocs -= size; + if (s->u.reiserfs_sb.s_kmallocs < 0) + reiserfs_warning ("vs-8302: reiserfs_kfree: allocated memory %d\n", s->u.reiserfs_sb.s_kmallocs); + +} + + +static int get_virtual_node_size (struct super_block * sb, struct buffer_head * bh) +{ + // int size = sizeof (struct virtual_item); /* for new item in case of insert */ + // int i, nr_items; + // struct item_head * ih; + + // this is enough for _ALL_ currently possible cases. In 4 k block + // one may put < 170 empty items. Each virtual item eats 12 + // byte. The biggest direntry item may have < 256 entries. Each + // entry would eat 2 byte of virtual node space + return sb->s_blocksize; + +#if 0 + size = sizeof (struct virtual_node) + sizeof (struct virtual_item); + ih = B_N_PITEM_HEAD (bh, 0); + nr_items = B_NR_ITEMS (bh); + for (i = 0; i < nr_items; i ++, ih ++) { + /* each item occupies some space in virtual node */ + size += sizeof (struct virtual_item); + if (is_direntry_le_ih (ih)) + /* each entry and new one occupeis 2 byte in the virtual node */ + size += (le16_to_cpu (ih->u.ih_entry_count) + 1) * sizeof (__u16); + } + + /* 1 bit for each bitmap block to note whether bitmap block was + dirtied in the operation */ + /* size += (SB_BMAP_NR (sb) * 2 / 8 + 4);*/ + return size; +#endif +} + + + +/* maybe we should fail balancing we are going to perform when kmalloc + fails several times. But now it will loop until kmalloc gets + required memory */ +static int get_mem_for_virtual_node (struct tree_balance * tb) +{ + int check_fs = 0; + int size; + char * buf; + + size = get_virtual_node_size (tb->tb_sb, PATH_PLAST_BUFFER (tb->tb_path)); + + if (size > tb->vn_buf_size) { + /* we have to allocate more memory for virtual node */ + if (tb->vn_buf) { + /* free memory allocated before */ + reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + /* this is not needed if kfree is atomic */ + check_fs = 1; + } + + /* virtual node requires now more memory */ + tb->vn_buf_size = size; + + /* get memory for virtual item */ + buf = reiserfs_kmalloc(size, GFP_ATOMIC, tb->tb_sb); + if ( ! buf ) { + /* getting memory with GFP_KERNEL priority may involve + balancing now (due to indirect_to_direct conversion on + dcache shrinking). So, release path and collected + resourses here */ + free_buffers_in_tb (tb); + buf = reiserfs_kmalloc(size, GFP_BUFFER, tb->tb_sb); + if ( !buf ) { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning ("vs-8345: get_mem_for_virtual_node: " + "kmalloc failed. 
reiserfs kmalloced %d bytes\n", + tb->tb_sb->u.reiserfs_sb.s_kmallocs); +#endif + tb->vn_buf_size = 0; + } + tb->vn_buf = buf; + schedule() ; + return REPEAT_SEARCH; + } + + tb->vn_buf = buf; + } + + if ( check_fs && FILESYSTEM_CHANGED_TB (tb) ) + return REPEAT_SEARCH; + + return CARRY_ON; +} + + +#ifdef CONFIG_REISERFS_CHECK +static void tb_buffer_sanity_check (struct super_block * p_s_sb, + struct buffer_head * p_s_bh, + const char *descr, int level) { + if (p_s_bh) { + if (atomic_read (&(p_s_bh->b_count)) <= 0) { + + reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): negative or zero reference counter for buffer %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if ( ! buffer_uptodate (p_s_bh) ) { + reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not up to date %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if ( ! B_IS_IN_TREE (p_s_bh) ) { + reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): buffer is not in tree %s[%d] (%b)\n", descr, level, p_s_bh); + } + + if (p_s_bh->b_dev != p_s_sb->s_dev || + p_s_bh->b_size != p_s_sb->s_blocksize || + p_s_bh->b_blocknr > SB_BLOCK_COUNT(p_s_sb)) { + reiserfs_panic (p_s_sb, "tb_buffer_sanity_check(): check failed for buffer %s[%d] (%b)\n", descr, level, p_s_bh); + } + } +} +#endif + +static void clear_all_dirty_bits(struct super_block *s, + struct buffer_head *bh) { + reiserfs_prepare_for_journal(s, bh, 0) ; +} + +static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) +{ + struct buffer_head * locked; +#ifdef CONFIG_REISERFS_CHECK + int repeat_counter = 0; +#endif + int i; + + do { + + locked = NULL; + + for ( i = p_s_tb->tb_path->path_length; !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i-- ) { + if ( PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i) ) { + /* if I understand correctly, we can only be sure the last buffer + ** in the path is in the tree --clm + */ +#ifdef CONFIG_REISERFS_CHECK + if (PATH_PLAST_BUFFER(p_s_tb->tb_path) == + PATH_OFFSET_PBUFFER(p_s_tb->tb_path, i)) { + tb_buffer_sanity_check (p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i), + "S", + p_s_tb->tb_path->path_length - i); + } +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ; + + if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ) + locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); + } + } + + for ( i = 0; !locked && i < MAX_HEIGHT && p_s_tb->insert_size[i]; i++ ) { + + if (p_s_tb->lnum[i] ) { + + if ( p_s_tb->L[i] ) { +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ; + if ( buffer_locked (p_s_tb->L[i]) ) + locked = p_s_tb->L[i]; + } + + if ( !locked && p_s_tb->FL[i] ) { +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ; + if ( buffer_locked (p_s_tb->FL[i]) ) + locked = p_s_tb->FL[i]; + } + + if ( !locked && p_s_tb->CFL[i] ) { +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ; + if ( buffer_locked (p_s_tb->CFL[i]) ) + locked = p_s_tb->CFL[i]; + } + + } + + if ( !locked && (p_s_tb->rnum[i]) ) { + + if ( p_s_tb->R[i] ) { +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ; + if ( buffer_locked (p_s_tb->R[i]) ) + locked = p_s_tb->R[i]; + } + + + if ( !locked && p_s_tb->FR[i] ) 
{ +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ; + if ( buffer_locked (p_s_tb->FR[i]) ) + locked = p_s_tb->FR[i]; + } + + if ( !locked && p_s_tb->CFR[i] ) { +#ifdef CONFIG_REISERFS_CHECK + tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); +#endif + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ; + if ( buffer_locked (p_s_tb->CFR[i]) ) + locked = p_s_tb->CFR[i]; + } + } + } + /* as far as I can tell, this is not required. The FEB list seems + ** to be full of newly allocated nodes, which will never be locked, + ** dirty, or anything else. + ** To be safe, I'm putting in the checks and waits in. For the moment, + ** they are needed to keep the code in journal.c from complaining + ** about the buffer. That code is inside CONFIG_REISERFS_CHECK as well. + ** --clm + */ + for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { + if ( p_s_tb->FEB[i] ) { + clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ; + if (buffer_locked(p_s_tb->FEB[i])) { + locked = p_s_tb->FEB[i] ; + } + } + } + + if (locked) { +#ifdef CONFIG_REISERFS_CHECK + repeat_counter++; + if ( (repeat_counter % 10000) == 0) { + reiserfs_warning ("wait_tb_buffers_until_released(): too many iterations waiting for buffer to unlock (%b)\n", locked); + + /* Don't loop forever. Try to recover from possible error. */ + + return ( FILESYSTEM_CHANGED_TB (p_s_tb) ) ? REPEAT_SEARCH : CARRY_ON; + } +#endif + __wait_on_buffer (locked); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) { + return REPEAT_SEARCH; + } + } + + } while (locked); + + return CARRY_ON; +} + + +/* Prepare for balancing, that is + * get all necessary parents, and neighbors; + * analyze what and where should be moved; + * get sufficient number of new nodes; + * Balancing will start only after all resources will be collected at a time. + * + * When ported to SMP kernels, only at the last moment after all needed nodes + * are collected in cache, will the resources be locked using the usual + * textbook ordered lock acquisition algorithms. Note that ensuring that + * this code neither write locks what it does not need to write lock nor locks out of order + * will be a pain in the butt that could have been avoided. Grumble grumble. -Hans + * + * fix is meant in the sense of render unchanging + * + * Latency might be improved by first gathering a list of what buffers are needed + * and then getting as many of them in parallel as possible? 
-Hans + * + * Parameters: + * op_mode i - insert, d - delete, c - cut (truncate), p - paste (append) + * tb tree_balance structure; + * inum item number in S[h]; + * pos_in_item - comment this if you can + * ins_ih & ins_sd are used when inserting + * Returns: 1 - schedule occurred while the function worked; + * 0 - schedule didn't occur while the function worked; + * -1 - if no_disk_space + */ + + +int fix_nodes (int n_op_mode, + struct tree_balance * p_s_tb, + struct item_head * p_s_ins_ih, // item head of item being inserted + const void * data // inserted item or data to be pasted + ) { + int n_ret_value, + n_h, + n_item_num = PATH_LAST_POSITION(p_s_tb->tb_path); + int n_pos_in_item; + + /* we set wait_tb_buffers_run when we have to restore any dirty bits cleared + ** during wait_tb_buffers_run + */ + int wait_tb_buffers_run = 0 ; + int windex ; + struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); + + n_pos_in_item = p_s_tb->tb_path->pos_in_item; + + + p_s_tb->fs_gen = get_generation (p_s_tb->tb_sb); + + /* we prepare and log the super here so it will already be in the + ** transaction when do_balance needs to change it. + ** This way do_balance won't have to schedule when trying to prepare + ** the super for logging + */ + reiserfs_prepare_for_journal(p_s_tb->tb_sb, + SB_BUFFER_WITH_SB(p_s_tb->tb_sb), 1) ; + journal_mark_dirty(p_s_tb->transaction_handle, p_s_tb->tb_sb, + SB_BUFFER_WITH_SB(p_s_tb->tb_sb)) ; + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) + return REPEAT_SEARCH; + + /* if it possible in indirect_to_direct conversion */ + if (buffer_locked (p_s_tbS0)) { + __wait_on_buffer (p_s_tbS0); + if ( FILESYSTEM_CHANGED_TB (p_s_tb) ) + return REPEAT_SEARCH; + } + +#ifndef __KERNEL__ + if ( atomic_read (&(p_s_tbS0->b_count)) > 1 || + (p_s_tb->L[0] && atomic_read (&(p_s_tb->L[0]->b_count)) > 1) || + (p_s_tb->R[0] && atomic_read (&(p_s_tb->R[0]->b_count)) > 1) ) { + printk ("mode=%c, insert_size=%d\n", n_op_mode, p_s_tb->insert_size[0]); + print_cur_tb ("first three parameters are invalid"); + reiserfs_panic (p_s_tb->tb_sb, "PAP-8310: fix_nodes: all buffers must be hold once in one thread processing"); + } +#endif + +#ifdef CONFIG_REISERFS_CHECK + if ( cur_tb ) { + print_cur_tb ("fix_nodes"); + reiserfs_panic(p_s_tb->tb_sb,"PAP-8305: fix_nodes: there is pending do_balance"); + } + + if (!buffer_uptodate (p_s_tbS0) || !B_IS_IN_TREE (p_s_tbS0)) { + reiserfs_panic (p_s_tb->tb_sb, "PAP-8320: fix_nodes: S[0] (%b %z) is not uptodate " + "at the beginning of fix_nodes or not in tree (mode %c)", p_s_tbS0, p_s_tbS0, n_op_mode); + } + + // FIXME: new items have to be of 8 byte multiples. Including new + // directory items those look like old ones + /* + if (p_s_tb->insert_size[0] % 8) + reiserfs_panic (p_s_tb->tb_sb, "vs-: fix_nodes: incorrect insert_size %d, " + "mode %c", + p_s_tb->insert_size[0], n_op_mode); + */ + + /* Check parameters. 
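+     (the mode switch below, like the sanity checks above it, is only
+     compiled in when CONFIG_REISERFS_CHECK is defined).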
*/ + switch (n_op_mode) { +#ifdef REISERFS_FSCK + case M_INTERNAL: + break; + case M_INSERT: + if ( n_item_num < 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) ) + reiserfs_panic(p_s_tb->tb_sb,"PAP-8325: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", + n_item_num, B_NR_ITEMS(p_s_tbS0)); +#else + case M_INSERT: + if ( n_item_num <= 0 || n_item_num > B_NR_ITEMS(p_s_tbS0) ) + reiserfs_panic(p_s_tb->tb_sb,"PAP-8330: fix_nodes: Incorrect item number %d (in S0 - %d) in case of insert", + n_item_num, B_NR_ITEMS(p_s_tbS0)); +#endif + break; + case M_PASTE: + case M_DELETE: + case M_CUT: + if ( n_item_num < 0 || n_item_num >= B_NR_ITEMS(p_s_tbS0) ) { + print_block (p_s_tbS0, 0, -1, -1); + printk("mode = %c insert_size = %d\n", n_op_mode, p_s_tb->insert_size[0]); + reiserfs_panic(p_s_tb->tb_sb,"PAP-8335: fix_nodes: Incorrect item number(%d)", n_item_num); + } + break; + default: + reiserfs_panic(p_s_tb->tb_sb,"PAP-8340: fix_nodes: Incorrect mode of operation"); + } +#endif + + if (get_mem_for_virtual_node (p_s_tb) == REPEAT_SEARCH) + // FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat + return REPEAT_SEARCH; + + + /* Starting from the leaf level; for all levels n_h of the tree. */ + for ( n_h = 0; n_h < MAX_HEIGHT && p_s_tb->insert_size[n_h]; n_h++ ) { + if ( (n_ret_value = get_direct_parent(p_s_tb, n_h)) != CARRY_ON ) { + goto repeat; + return n_ret_value; + } + + if ( (n_ret_value = check_balance (n_op_mode, p_s_tb, n_h, n_item_num, + n_pos_in_item, p_s_ins_ih, data)) != CARRY_ON ) { + if ( n_ret_value == NO_BALANCING_NEEDED ) { + /* No balancing for higher levels needed. */ + if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { + goto repeat; + return n_ret_value; + } + if ( n_h != MAX_HEIGHT - 1 ) + p_s_tb->insert_size[n_h + 1] = 0; + /* ok, analysis and resource gathering are complete */ + break; + } + goto repeat; + return n_ret_value; + } + + if ( (n_ret_value = get_neighbors(p_s_tb, n_h)) != CARRY_ON ) { + goto repeat; + return n_ret_value; + } + + if ( (n_ret_value = get_empty_nodes(p_s_tb, n_h)) != CARRY_ON ) { + goto repeat; + return n_ret_value; /* No disk space, or schedule occurred and + analysis may be invalid and needs to be redone. */ + } + + if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h) ) { + /* We have a positive insert size but no nodes exist on this + level, this means that we are creating a new root. */ + +#ifdef CONFIG_REISERFS_CHECK + if ( p_s_tb->blknum[n_h] != 1 ) + reiserfs_panic(p_s_tb->tb_sb,"PAP-8350: fix_nodes: creating new empty root"); +#endif /* CONFIG_REISERFS_CHECK */ + + if ( n_h < MAX_HEIGHT - 1 ) + p_s_tb->insert_size[n_h + 1] = 0; + } + else + if ( ! PATH_H_PBUFFER(p_s_tb->tb_path, n_h + 1) ) { + if ( p_s_tb->blknum[n_h] > 1 ) { + /* The tree needs to be grown, so this node S[n_h] + which is the root node is split into two nodes, + and a new node (S[n_h+1]) will be created to + become the root node. 
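+	   The new root will have to hold blknum[n_h] child pointers and
+	   blknum[n_h] - 1 delimiting keys, which is where the insert size
+	   (DC_SIZE + KEY_SIZE) * (blknum[n_h] - 1) + DC_SIZE set below comes
+	   from; splitting the old root in two, for example, costs one key
+	   plus two pointers at the new level.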
*/ + +#ifdef CONFIG_REISERFS_CHECK + if ( n_h == MAX_HEIGHT - 1 ) + reiserfs_panic(p_s_tb->tb_sb, "PAP-8355: fix_nodes: attempt to create too high of a tree"); +#endif /* CONFIG_REISERFS_CHECK */ + + p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1) + DC_SIZE; + } + else + if ( n_h < MAX_HEIGHT - 1 ) + p_s_tb->insert_size[n_h + 1] = 0; + } + else + p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); + } + + + windex = push_journal_writer("fix_nodes") ; + if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) { + pop_journal_writer(windex) ; + if (FILESYSTEM_CHANGED_TB(p_s_tb)) { + wait_tb_buffers_run = 1 ; + n_ret_value = REPEAT_SEARCH ; + goto repeat; + } else { + return CARRY_ON; + } + } else { + wait_tb_buffers_run = 1 ; + pop_journal_writer(windex) ; + goto repeat; + } + + repeat: + // fix_nodes was unable to perform its calculation due to + // filesystem got changed under us, lack of free disk space or i/o + // failure. If the first is the case - the search will be + // repeated. For now - free all resources acquired so far except + // for the new allocated nodes + { + int i; + + /* Release path buffers. */ + if (wait_tb_buffers_run) { + pathrelse_and_restore(p_s_tb->tb_sb, p_s_tb->tb_path) ; + } else { + pathrelse (p_s_tb->tb_path); + } + /* brelse all resources collected for balancing */ + for ( i = 0; i < MAX_HEIGHT; i++ ) { + if (wait_tb_buffers_run) { + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->L[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->R[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FL[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->FR[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFL[i]); + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, p_s_tb->CFR[i]); + } + + brelse (p_s_tb->L[i]);p_s_tb->L[i] = 0; + brelse (p_s_tb->R[i]);p_s_tb->R[i] = 0; + brelse (p_s_tb->FL[i]);p_s_tb->FL[i] = 0; + brelse (p_s_tb->FR[i]);p_s_tb->FR[i] = 0; + brelse (p_s_tb->CFL[i]);p_s_tb->CFL[i] = 0; + brelse (p_s_tb->CFR[i]);p_s_tb->CFR[i] = 0; + } + + if (wait_tb_buffers_run) { + for ( i = 0; i < MAX_FEB_SIZE; i++ ) { + if ( p_s_tb->FEB[i] ) { + reiserfs_restore_prepared_buffer(p_s_tb->tb_sb, + p_s_tb->FEB[i]) ; + } + } + } + return n_ret_value; + } + +} + + +/* Anatoly will probably forgive me renaming p_s_tb to tb. I just + wanted to make lines shorter */ +void unfix_nodes (struct tree_balance * tb) +{ + int i; + +#ifdef CONFIG_REISERFS_CHECK + if ( ! tb->vn_buf ) + reiserfs_panic (tb->tb_sb, + "PAP-16050: unfix_nodes: pointer to the virtual node is NULL"); +#endif + + /* Release path buffers. 
*/ + pathrelse_and_restore (tb->tb_sb, tb->tb_path); + + /* brelse all resources collected for balancing */ + for ( i = 0; i < MAX_HEIGHT; i++ ) { + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->L[i]); + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->R[i]); + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FL[i]); + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->FR[i]); + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFL[i]); + reiserfs_restore_prepared_buffer (tb->tb_sb, tb->CFR[i]); + + brelse (tb->L[i]); + brelse (tb->R[i]); + brelse (tb->FL[i]); + brelse (tb->FR[i]); + brelse (tb->CFL[i]); + brelse (tb->CFR[i]); + } + + /* deal with list of allocated (used and unused) nodes */ + for ( i = 0; i < MAX_FEB_SIZE; i++ ) { + if ( tb->FEB[i] ) { + unsigned long blocknr = tb->FEB[i]->b_blocknr ; + /* de-allocated block which was not used by balancing and + bforget about buffer for it */ + brelse (tb->FEB[i]); + reiserfs_free_block (tb->transaction_handle, blocknr); + } + if (tb->used[i]) { + /* release used as new nodes including a new root */ + brelse (tb->used[i]); + } + } + +#if 0 /* shouldn't this be in CONFIG_REISERFS_CHECK??? */ + /* make sure, that all we have released got really freed */ + for (i = 0; i < sizeof (tb->thrown) / sizeof (tb->thrown[0]); i ++) + if (tb->thrown[i]) { + if (atomic_read (&(tb->thrown[i]->b_count))) { + /* the log will have the count at one and the buffers marked */ + if (atomic_read(&(tb->thrown[i]->b_count)) > 1 || + !(buffer_journaled(tb->thrown[i]) || + buffer_journal_dirty(tb->thrown[i]))) { + foo_print (tb->thrown[i], tb->tb_sb); + printk ("unfix_nodes: Waiting...(block %lu, count %d)\n", + tb->thrown[i]->b_blocknr, + atomic_read (&(tb->thrown[i]->b_count))); + wait_buffer_until_released (tb->thrown[i]); + printk ("unfix_nodes: Done (block %lu, count %d)\n", + tb->thrown[i]->b_blocknr, + atomic_read (&(tb->thrown[i]->b_count))); + } + } + } +#endif /* 0 */ + reiserfs_kfree (tb->vn_buf, tb->vn_buf_size, tb->tb_sb); + +} + + + +#ifndef REISERFS_FSCK + +// is_left_mergeable is now one of the item methods + +#else + +// this works only in fsck + +int are_items_mergeable (struct item_head * left, struct item_head * right, int bsize) +{ + if (comp_keys (&left->ih_key, &right->ih_key) != -1) { + reiserfs_panic (0, "vs-16070: are_items_mergeable: left %k, right %k", &(left->ih_key), &(right->ih_key)); + } + + if (comp_short_keys (&left->ih_key, &right->ih_key)) + return 0; + + if (I_IS_DIRECTORY_ITEM (left)) { + return 1; + } + + if ((I_IS_DIRECT_ITEM (left) && I_IS_DIRECT_ITEM (right)) || + (I_IS_INDIRECT_ITEM (left) && I_IS_INDIRECT_ITEM (right))) + return (left->ih_key.k_offset + I_BYTES_NUMBER (left, bsize) == right->ih_key.k_offset) ? 
1 : 0; + + return 0; +} + +/* get left neighbor of the leaf node */ +static struct buffer_head * get_left_neighbor (struct super_block * s, struct path * path) +{ + struct key key; + INITIALIZE_PATH (path_to_left_neighbor); + struct buffer_head * bh; + + copy_key (&key, B_N_PKEY (PATH_PLAST_BUFFER (path), 0)); + decrement_key (&key); + +/* init_path (&path_to_left_neighbor);*/ + search_by_key (s, &key, &path_to_left_neighbor, DISK_LEAF_NODE_LEVEL, READ_BLOCKS); + // FIXME: fsck is to handle I/O failures somehow as well + if (PATH_LAST_POSITION (&path_to_left_neighbor) == 0) { + pathrelse (&path_to_left_neighbor); + return 0; + } + bh = PATH_PLAST_BUFFER (&path_to_left_neighbor); + bh->b_count ++; + pathrelse (&path_to_left_neighbor); + return bh; +} + +extern struct key MIN_KEY; +static struct buffer_head * get_right_neighbor (struct super_block * s, struct path * path) +{ + struct key key; + struct key * rkey; + INITIALIZE_PATH (path_to_right_neighbor); + struct buffer_head * bh; + + rkey = get_rkey (path, s); + if (comp_keys (rkey, &MIN_KEY) == 0) + reiserfs_panic (s, "vs-16080: get_right_neighbor: get_rkey returned min key (path has changed)"); + copy_key (&key, rkey); + + + /*init_path (&path_to_right_neighbor);*/ + search_by_key (s, &key, &path_to_right_neighbor, DISK_LEAF_NODE_LEVEL, READ_BLOCKS); + if (PATH_PLAST_BUFFER (&path_to_right_neighbor) == PATH_PLAST_BUFFER (path)) { + pathrelse (&path_to_right_neighbor); + return 0; + } + bh = PATH_PLAST_BUFFER (&path_to_right_neighbor); + bh->b_count ++; + pathrelse (&path_to_right_neighbor); + return bh; +} + + +int is_left_mergeable (struct super_block * s, struct path * path) +{ + struct item_head * right; + struct buffer_head * bh; + int retval; + + right = B_N_PITEM_HEAD (PATH_PLAST_BUFFER (path), 0); + + bh = get_left_neighbor (s, path); + if (bh == 0) { + return 0; + } + retval = are_items_mergeable (B_N_PITEM_HEAD (bh, B_NR_ITEMS (bh) - 1), right, bh->b_size); + brelse (bh); + return retval; +} + + +int is_right_mergeable (struct super_block * s, struct path * path) +{ + struct item_head * left; + struct buffer_head * bh; + int retval; + + left = B_N_PITEM_HEAD (PATH_PLAST_BUFFER (path), B_NR_ITEMS (PATH_PLAST_BUFFER (path)) - 1); + + bh = get_right_neighbor (s, path); + if (bh == 0) { + return 0; + } + retval = are_items_mergeable (left, B_N_PITEM_HEAD (bh, 0), bh->b_size); + brelse (bh); + return retval; +} + +#endif /* REISERFS_FSCK */ + + + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/hashes.c linux/fs/reiserfs/hashes.c --- v2.4.0/linux/fs/reiserfs/hashes.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/hashes.c Mon Jan 15 12:42:32 2001 @@ -0,0 +1,226 @@ + +/* + * Keyed 32-bit hash function using TEA in a Davis-Meyer function + * H0 = Key + * Hi = E Mi(Hi-1) + Hi-1 + * + * (see Applied Cryptography, 2nd edition, p448). + * + * Jeremy Fitzhardinge 1998 + * + * Jeremy has agreed to the contents of reiserfs/README. 
-Hans + * Yura's function is added (04/07/2000) + */ + +// +// keyed_hash +// yura_hash +// r5_hash +// + +#include + + + +#define DELTA 0x9E3779B9 +#define FULLROUNDS 10 /* 32 is overkill, 16 is strong crypto */ +#define PARTROUNDS 6 /* 6 gets complete mixing */ + +#ifndef __KERNEL__ +typedef __u32 u32; +#endif + +/* a, b, c, d - data; h0, h1 - accumulated hash */ +#define TEACORE(rounds) \ + do { \ + u32 sum = 0; \ + int n = rounds; \ + u32 b0, b1; \ + \ + b0 = h0; \ + b1 = h1; \ + \ + do \ + { \ + sum += DELTA; \ + b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); \ + b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); \ + } while(--n); \ + \ + h0 += b0; \ + h1 += b1; \ + } while(0) + + +u32 keyed_hash(const char *msg, int len) +{ + u32 k[] = { 0x9464a485, 0x542e1a94, 0x3e846bff, 0xb75bcfc3}; + + u32 h0 = k[0], h1 = k[1]; + u32 a, b, c, d; + u32 pad; + int i; + + + // assert(len >= 0 && len < 256); + + pad = (u32)len | ((u32)len << 8); + pad |= pad << 16; + + while(len >= 16) + { + a = (u32)msg[ 0] | + (u32)msg[ 1] << 8 | + (u32)msg[ 2] << 16| + (u32)msg[ 3] << 24; + b = (u32)msg[ 4] | + (u32)msg[ 5] << 8 | + (u32)msg[ 6] << 16| + (u32)msg[ 7] << 24; + c = (u32)msg[ 8] | + (u32)msg[ 9] << 8 | + (u32)msg[10] << 16| + (u32)msg[11] << 24; + d = (u32)msg[12] | + (u32)msg[13] << 8 | + (u32)msg[14] << 16| + (u32)msg[15] << 24; + + TEACORE(PARTROUNDS); + + len -= 16; + msg += 16; + } + + if (len >= 12) + { + //assert(len < 16); + if (len >= 16) + *(int *)0 = 0; + + a = (u32)msg[ 0] | + (u32)msg[ 1] << 8 | + (u32)msg[ 2] << 16| + (u32)msg[ 3] << 24; + b = (u32)msg[ 4] | + (u32)msg[ 5] << 8 | + (u32)msg[ 6] << 16| + (u32)msg[ 7] << 24; + c = (u32)msg[ 8] | + (u32)msg[ 9] << 8 | + (u32)msg[10] << 16| + (u32)msg[11] << 24; + + d = pad; + for(i = 12; i < len; i++) + { + d <<= 8; + d |= msg[i]; + } + } + else if (len >= 8) + { + //assert(len < 12); + if (len >= 12) + *(int *)0 = 0; + a = (u32)msg[ 0] | + (u32)msg[ 1] << 8 | + (u32)msg[ 2] << 16| + (u32)msg[ 3] << 24; + b = (u32)msg[ 4] | + (u32)msg[ 5] << 8 | + (u32)msg[ 6] << 16| + (u32)msg[ 7] << 24; + + c = d = pad; + for(i = 8; i < len; i++) + { + c <<= 8; + c |= msg[i]; + } + } + else if (len >= 4) + { + //assert(len < 8); + if (len >= 8) + *(int *)0 = 0; + a = (u32)msg[ 0] | + (u32)msg[ 1] << 8 | + (u32)msg[ 2] << 16| + (u32)msg[ 3] << 24; + + b = c = d = pad; + for(i = 4; i < len; i++) + { + b <<= 8; + b |= msg[i]; + } + } + else + { + //assert(len < 4); + if (len >= 4) + *(int *)0 = 0; + a = b = c = d = pad; + for(i = 0; i < len; i++) + { + a <<= 8; + a |= msg[i]; + } + } + + TEACORE(FULLROUNDS); + +/* return 0;*/ + return h0^h1; +} + +/* What follows in this file is copyright 2000 by Hans Reiser, and the + * licensing of what follows is governed by reiserfs/README */ + +u32 yura_hash (const char *msg, int len) +{ + int j, pow; + u32 a, c; + int i; + + for (pow=1,i=1; i < len; i++) pow = pow * 10; + + if (len == 1) + a = msg[0]-48; + else + a = (msg[0] - 48) * pow; + + for (i=1; i < len; i++) { + c = msg[i] - 48; + for (pow=1,j=i; j < len-1; j++) pow = pow * 10; + a = a + c * pow; + } + + for (; i < 40; i++) { + c = '0' - 48; + for (pow=1,j=i; j < len-1; j++) pow = pow * 10; + a = a + c * pow; + } + + for (; i < 256; i++) { + c = i; + for (pow=1,j=i; j < len-1; j++) pow = pow * 10; + a = a + c * pow; + } + + a = a << 7; + return a; +} + +u32 r5_hash (const char *msg, int len) +{ + u32 a=0; + while(*msg) { + a += *msg << 4; + a += *msg >> 4; + a *= 11; + msg++; + } + return a; +} diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/ibalance.c 
linux/fs/reiserfs/ibalance.c --- v2.4.0/linux/fs/reiserfs/ibalance.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/ibalance.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,1140 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +/* this is one and only function that is used outside (do_balance.c) */ +int balance_internal ( + struct tree_balance * , + int, + int, + struct item_head * , + struct buffer_head ** + ); + +/* modes of internal_shift_left, internal_shift_right and internal_insert_childs */ +#define INTERNAL_SHIFT_FROM_S_TO_L 0 +#define INTERNAL_SHIFT_FROM_R_TO_S 1 +#define INTERNAL_SHIFT_FROM_L_TO_S 2 +#define INTERNAL_SHIFT_FROM_S_TO_R 3 +#define INTERNAL_INSERT_TO_S 4 +#define INTERNAL_INSERT_TO_L 5 +#define INTERNAL_INSERT_TO_R 6 + +static void internal_define_dest_src_infos ( + int shift_mode, + struct tree_balance * tb, + int h, + struct buffer_info * dest_bi, + struct buffer_info * src_bi, + int * d_key, + struct buffer_head ** cf + ) +{ +#ifdef CONFIG_REISERFS_CHECK + memset (dest_bi, 0, sizeof (struct buffer_info)); + memset (src_bi, 0, sizeof (struct buffer_info)); +#endif + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case INTERNAL_SHIFT_FROM_S_TO_L: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position (tb, h); + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + case INTERNAL_SHIFT_FROM_L_TO_S: + src_bi->tb = tb; + src_bi->bi_bh = tb->L[h]; + src_bi->bi_parent = tb->FL[h]; + src_bi->bi_position = get_left_neighbor_position (tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); /* dest position is analog of dest->b_item_order */ + *d_key = tb->lkey[h]; + *cf = tb->CFL[h]; + break; + + case INTERNAL_SHIFT_FROM_R_TO_S: /* used in internal_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[h]; + src_bi->bi_parent = tb->FR[h]; + src_bi->bi_position = get_right_neighbor_position (tb, h); + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_SHIFT_FROM_S_TO_R: + src_bi->tb = tb; + src_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position (tb, h); + *d_key = tb->rkey[h]; + *cf = tb->CFR[h]; + break; + + case INTERNAL_INSERT_TO_L: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[h]; + dest_bi->bi_parent = tb->FL[h]; + dest_bi->bi_position = get_left_neighbor_position (tb, h); + break; + + case INTERNAL_INSERT_TO_S: + dest_bi->tb = tb; + dest_bi->bi_bh = PATH_H_PBUFFER (tb->tb_path, h); + dest_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, h); + dest_bi->bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + break; + + 
case INTERNAL_INSERT_TO_R: + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[h]; + dest_bi->bi_parent = tb->FR[h]; + dest_bi->bi_position = get_right_neighbor_position (tb, h); + break; + + default: + reiserfs_panic (tb->tb_sb, "internal_define_dest_src_infos", "shift type is unknown (%d)", shift_mode); + } +} + + + +/* Insert count node pointers into buffer cur before position to + 1. + * Insert count items into buffer cur before position to. + * Items and node pointers are specified by inserted and bh respectively. + */ +static void internal_insert_childs (struct buffer_info * cur_bi, + int to, int count, + struct item_head * inserted, + struct buffer_head ** bh + ) +{ + struct buffer_head * cur = cur_bi->bi_bh; + struct block_head * blkh; + int nr; + struct key * ih; + struct disk_child new_dc[2]; + struct disk_child * dc; + int i; + + if (count <= 0) + return; + + nr = le16_to_cpu ((blkh = B_BLK_HEAD(cur))->blk_nr_item); + +#ifdef CONFIG_REISERFS_CHECK + if (count > 2) + reiserfs_panic (0, "internal_insert_childs", "too many children (%d) are to be inserted", count); + if (B_FREE_SPACE (cur) < count * (KEY_SIZE + DC_SIZE)) + reiserfs_panic (0, "internal_insert_childs", "no enough free space (%d), needed %d bytes", + B_FREE_SPACE (cur), count * (KEY_SIZE + DC_SIZE)); +#endif /* CONFIG_REISERFS_CHECK */ + + /* prepare space for count disk_child */ + dc = B_N_CHILD(cur,to+1); + + memmove (dc + count, dc, (nr+1-(to+1)) * DC_SIZE); + + /* copy to_be_insert disk children */ + for (i = 0; i < count; i ++) { + new_dc[i].dc_size = + cpu_to_le16 (MAX_CHILD_SIZE(bh[i]) - B_FREE_SPACE (bh[i])); + new_dc[i].dc_block_number = cpu_to_le32 (bh[i]->b_blocknr); + } + memcpy (dc, new_dc, DC_SIZE * count); + + + /* prepare space for count items */ + ih = B_N_PDELIM_KEY (cur, ((to == -1) ? 0 : to)); + + memmove (ih + count, ih, (nr - to) * KEY_SIZE + (nr + 1 + count) * DC_SIZE); + + /* copy item headers (keys) */ + memcpy (ih, inserted, KEY_SIZE); + if ( count > 1 ) + memcpy (ih + 1, inserted + 1, KEY_SIZE); + + /* sizes, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) + count); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - count * (DC_SIZE + KEY_SIZE)); + + do_balance_mark_internal_dirty (cur_bi->tb, cur,0); + + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (cur); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + + if (cur_bi->bi_parent) { + B_N_CHILD (cur_bi->bi_parent,cur_bi->bi_position)->dc_size += count * (DC_SIZE + KEY_SIZE); + do_balance_mark_internal_dirty(cur_bi->tb, cur_bi->bi_parent, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + } + +} + + +/* Delete del_num items and node pointers from buffer cur starting from * + * the first_i'th item and first_p'th pointers respectively. 
*/ +static void internal_delete_pointers_items ( + struct buffer_info * cur_bi, + int first_p, + int first_i, + int del_num + ) +{ + struct buffer_head * cur = cur_bi->bi_bh; + int nr; + struct block_head * blkh; + struct key * key; + struct disk_child * dc; + +#ifdef CONFIG_REISERFS_CHECK + if (cur == NULL) + reiserfs_panic (0, "internal_delete_pointers_items1: buffer is 0"); + + if (del_num < 0) + reiserfs_panic (0, "internal_delete_pointers_items2", + "negative number of items (%d) can not be deleted", del_num); + + if (first_p < 0 || first_p + del_num > B_NR_ITEMS (cur) + 1 || first_i < 0) + reiserfs_panic (0, "internal_delete_pointers_items3", + "first pointer order (%d) < 0 or " + "no so many pointers (%d), only (%d) or " + "first key order %d < 0", first_p, + first_p + del_num, B_NR_ITEMS (cur) + 1, first_i); +#endif /* CONFIG_REISERFS_CHECK */ + if ( del_num == 0 ) + return; + + nr = le16_to_cpu ((blkh = B_BLK_HEAD(cur))->blk_nr_item); + + if ( first_p == 0 && del_num == nr + 1 ) { +#ifdef CONFIG_REISERFS_CHECK + if ( first_i != 0 ) + reiserfs_panic (0, "internal_delete_pointers_items5", + "first deleted key must have order 0, not %d", first_i); +#endif /* CONFIG_REISERFS_CHECK */ + make_empty_node (cur_bi); + return; + } + +#ifdef CONFIG_REISERFS_CHECK + if (first_i + del_num > B_NR_ITEMS (cur)) { + printk("first_i = %d del_num = %d\n",first_i,del_num); + reiserfs_panic (0, "internal_delete_pointers_items4: :" + "no so many keys (%d) in the node (%b)(%z)", first_i + del_num, cur, cur); + } +#endif /* CONFIG_REISERFS_CHECK */ + + + /* deleting */ + dc = B_N_CHILD (cur, first_p); + + memmove (dc, dc + del_num, (nr + 1 - first_p - del_num) * DC_SIZE); + key = B_N_PDELIM_KEY (cur, first_i); + memmove (key, key + del_num, (nr - first_i - del_num) * KEY_SIZE + (nr + 1 - del_num) * DC_SIZE); + + + /* sizes, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) - del_num); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) + del_num * (KEY_SIZE + DC_SIZE)); + + do_balance_mark_internal_dirty (cur_bi->tb, cur, 0); + /*&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (cur); + /*&&&&&&&&&&&&&&&&&&&&&&&*/ + + if (cur_bi->bi_parent) { + B_N_CHILD (cur_bi->bi_parent, cur_bi->bi_position)->dc_size -= del_num * (KEY_SIZE + DC_SIZE); + do_balance_mark_internal_dirty (cur_bi->tb, cur_bi->bi_parent,0); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (cur_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + } +} + + +/* delete n node pointers and items starting from given position */ +static void internal_delete_childs (struct buffer_info * cur_bi, + int from, int n) +{ + int i_from; + + i_from = (from == 0) ? from : from - 1; + + /* delete n pointers starting from `from' position in CUR; + delete n keys starting from 'i_from' position in CUR; + */ + internal_delete_pointers_items (cur_bi, from, i_from, n); +} + + +/* copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest +* last_first == FIRST_TO_LAST means, that we copy first items from src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy last items from src to head of dest + */ +static void internal_copy_pointers_items ( + struct buffer_info * dest_bi, + struct buffer_head * src, + int last_first, int cpy_num + ) +{ + /* ATTENTION! 
Number of node pointers in DEST is equal to number of items in DEST * + * as delimiting key have already inserted to buffer dest.*/ + struct buffer_head * dest = dest_bi->bi_bh; + int nr_dest, nr_src; + int dest_order, src_order; + struct block_head * blkh; + struct key * key; + struct disk_child * dc; + + nr_src = B_NR_ITEMS (src); + +#ifdef CONFIG_REISERFS_CHECK + if ( dest == NULL || src == NULL ) + reiserfs_panic (0, "internal_copy_pointers_items", "src (%p) or dest (%p) buffer is 0", src, dest); + + if (last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST) + reiserfs_panic (0, "internal_copy_pointers_items", + "invalid last_first parameter (%d)", last_first); + + if ( nr_src < cpy_num - 1 ) + reiserfs_panic (0, "internal_copy_pointers_items", "no so many items (%d) in src (%d)", cpy_num, nr_src); + + if ( cpy_num < 0 ) + reiserfs_panic (0, "internal_copy_pointers_items", "cpy_num less than 0 (%d)", cpy_num); + + if (cpy_num - 1 + B_NR_ITEMS(dest) > (int)MAX_NR_KEY(dest)) + reiserfs_panic (0, "internal_copy_pointers_items", + "cpy_num (%d) + item number in dest (%d) can not be more than MAX_NR_KEY(%d)", + cpy_num, B_NR_ITEMS(dest), MAX_NR_KEY(dest)); +#endif + + if ( cpy_num == 0 ) + return; + + /* coping */ + nr_dest = le16_to_cpu ((blkh = B_BLK_HEAD(dest))->blk_nr_item); + + /*dest_order = (last_first == LAST_TO_FIRST) ? 0 : nr_dest;*/ + /*src_order = (last_first == LAST_TO_FIRST) ? (nr_src - cpy_num + 1) : 0;*/ + (last_first == LAST_TO_FIRST) ? (dest_order = 0, src_order = nr_src - cpy_num + 1) : + (dest_order = nr_dest, src_order = 0); + + /* prepare space for cpy_num pointers */ + dc = B_N_CHILD (dest, dest_order); + + memmove (dc + cpy_num, dc, (nr_dest - dest_order) * DC_SIZE); + + /* insert pointers */ + memcpy (dc, B_N_CHILD (src, src_order), DC_SIZE * cpy_num); + + + /* prepare space for cpy_num - 1 item headers */ + key = B_N_PDELIM_KEY(dest, dest_order); + memmove (key + cpy_num - 1, key, + KEY_SIZE * (nr_dest - dest_order) + DC_SIZE * (nr_dest + cpy_num)); + + + /* insert headers */ + memcpy (key, B_N_PDELIM_KEY (src, src_order), KEY_SIZE * (cpy_num - 1)); + + /* sizes, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) + (cpy_num - 1)); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - (KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num)); + + do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (dest); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + + if (dest_bi->bi_parent) { + B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position)->dc_size += + KEY_SIZE * (cpy_num - 1) + DC_SIZE * cpy_num; + + do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (dest_bi->bi_parent); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + } + +} + + +/* Copy cpy_num node pointers and cpy_num - 1 items from buffer src to buffer dest. + * Delete cpy_num - del_par items and node pointers from buffer src. + * last_first == FIRST_TO_LAST means, that we copy/delete first items from src. + * last_first == LAST_TO_FIRST means, that we copy/delete last items from src. 
+ */ +static void internal_move_pointers_items (struct buffer_info * dest_bi, + struct buffer_info * src_bi, + int last_first, int cpy_num, int del_par) +{ + int first_pointer; + int first_item; + + internal_copy_pointers_items (dest_bi, src_bi->bi_bh, last_first, cpy_num); + + if (last_first == FIRST_TO_LAST) { /* shift_left occurs */ + first_pointer = 0; + first_item = 0; + /* delete cpy_num - del_par pointers and keys starting for pointers with first_pointer, + for key - with first_item */ + internal_delete_pointers_items (src_bi, first_pointer, first_item, cpy_num - del_par); + } else { /* shift_right occurs */ + int i, j; + + i = ( cpy_num - del_par == ( j = B_NR_ITEMS(src_bi->bi_bh)) + 1 ) ? 0 : j - cpy_num + del_par; + + internal_delete_pointers_items (src_bi, j + 1 - cpy_num + del_par, i, cpy_num - del_par); + } +} + +/* Insert n_src'th key of buffer src before n_dest'th key of buffer dest. */ +static void internal_insert_key (struct buffer_info * dest_bi, + int dest_position_before, /* insert key before key with n_dest number */ + struct buffer_head * src, + int src_position) +{ + struct buffer_head * dest = dest_bi->bi_bh; + int nr; + struct block_head * blkh; + struct key * key; + +#ifdef CONFIG_REISERFS_CHECK + if (dest == NULL || src == NULL) + reiserfs_panic (0, "internal_insert_key", "sourse(%p) or dest(%p) buffer is 0", src, dest); + + if (dest_position_before < 0 || src_position < 0) + reiserfs_panic (0, "internal_insert_key", "source(%d) or dest(%d) key number less than 0", + src_position, dest_position_before); + + if (dest_position_before > B_NR_ITEMS (dest) || src_position >= B_NR_ITEMS(src)) + reiserfs_panic (0, "internal_insert_key", + "invalid position in dest (%d (key number %d)) or in src (%d (key number %d))", + dest_position_before, B_NR_ITEMS (dest), src_position, B_NR_ITEMS(src)); + + if (B_FREE_SPACE (dest) < KEY_SIZE) + reiserfs_panic (0, "internal_insert_key", + "no enough free space (%d) in dest buffer", B_FREE_SPACE (dest)); +#endif + + nr = le16_to_cpu ((blkh=B_BLK_HEAD(dest))->blk_nr_item); + + /* prepare space for inserting key */ + key = B_N_PDELIM_KEY (dest, dest_position_before); + memmove (key + 1, key, (nr - dest_position_before) * KEY_SIZE + (nr + 1) * DC_SIZE); + + /* insert key */ + memcpy (key, B_N_PDELIM_KEY(src, src_position), KEY_SIZE); + + /* Change dirt, free space, item number fields. */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) + 1); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - KEY_SIZE); + + do_balance_mark_internal_dirty (dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { + B_N_CHILD(dest_bi->bi_parent,dest_bi->bi_position)->dc_size += KEY_SIZE; + do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent,0); + } +} + + + +/* Insert d_key'th (delimiting) key from buffer cfl to tail of dest. + * Copy pointer_amount node pointers and pointer_amount - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfl. + * Delete pointer_amount items and node pointers from buffer src. 
+ */ +/* this can be invoked both to shift from S to L and from R to S */ +static void internal_shift_left ( + int mode, /* INTERNAL_FROM_S_TO_L | INTERNAL_FROM_R_TO_S */ + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + + internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + /*printk("pointer_amount = %d\n",pointer_amount);*/ + + if (pointer_amount) { + /* insert delimiting key from common father of dest and src to node dest into position B_NR_ITEM(dest) */ + internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); + + if (B_NR_ITEMS(src_bi.bi_bh) == pointer_amount - 1) { + if (src_bi.bi_position/*src->b_item_order*/ == 0) + replace_key (tb, cf, d_key_position, src_bi.bi_parent/*src->b_parent*/, 0); + } else + replace_key (tb, cf, d_key_position, src_bi.bi_bh, pointer_amount - 1); + } + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 0); + +} + +/* Insert delimiting key to L[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to L[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shifts from S[h] to L[h] */ +static void internal_shift1_left ( + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + + internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + if ( pointer_amount > 0 ) /* insert lkey[h]-th key from CFL[h] to left neighbor L[h] */ + internal_insert_key (&dest_bi, B_NR_ITEMS(dest_bi.bi_bh), cf, d_key_position); + /* internal_insert_key (tb->L[h], B_NR_ITEM(tb->L[h]), tb->CFL[h], tb->lkey[h]);*/ + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, FIRST_TO_LAST, pointer_amount, 1); + /* internal_move_pointers_items (tb->L[h], tb->S[h], FIRST_TO_LAST, pointer_amount, 1);*/ +} + + +/* Insert d_key'th (delimiting) key from buffer cfr to head of dest. + * Copy n node pointers and n - 1 items from buffer src to buffer dest. + * Replace d_key'th key in buffer cfr. + * Delete n items and node pointers from buffer src. 
+ */ +static void internal_shift_right ( + int mode, /* INTERNAL_FROM_S_TO_R | INTERNAL_FROM_L_TO_S */ + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + int nr; + + + internal_define_dest_src_infos (mode, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + nr = B_NR_ITEMS (src_bi.bi_bh); + + if (pointer_amount > 0) { + /* insert delimiting key from common father of dest and src to dest node into position 0 */ + internal_insert_key (&dest_bi, 0, cf, d_key_position); + if (nr == pointer_amount - 1) { +#ifdef CONFIG_REISERFS_CHECK + if ( src_bi.bi_bh != PATH_H_PBUFFER (tb->tb_path, h)/*tb->S[h]*/ || dest_bi.bi_bh != tb->R[h]) + reiserfs_panic (tb->tb_sb, "internal_shift_right", "src (%p) must be == tb->S[h](%p) when it disappears", + src_bi.bi_bh, PATH_H_PBUFFER (tb->tb_path, h)); +#endif + /* when S[h] disappers replace left delemiting key as well */ + if (tb->CFL[h]) + replace_key (tb, cf, d_key_position, tb->CFL[h], tb->lkey[h]); + } else + replace_key (tb, cf, d_key_position, src_bi.bi_bh, nr - pointer_amount); + } + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 0); +} + +/* Insert delimiting key to R[h]. + * Copy n node pointers and n - 1 items from buffer S[h] to R[h]. + * Delete n - 1 items and node pointers from buffer S[h]. + */ +/* it always shift from S[h] to R[h] */ +static void internal_shift1_right ( + struct tree_balance * tb, + int h, + int pointer_amount + ) +{ + struct buffer_info dest_bi, src_bi; + struct buffer_head * cf; + int d_key_position; + + internal_define_dest_src_infos (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, &dest_bi, &src_bi, &d_key_position, &cf); + + if (pointer_amount > 0) /* insert rkey from CFR[h] to right neighbor R[h] */ + internal_insert_key (&dest_bi, 0, cf, d_key_position); + /* internal_insert_key (tb->R[h], 0, tb->CFR[h], tb->rkey[h]);*/ + + /* last parameter is del_parameter */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, pointer_amount, 1); + /* internal_move_pointers_items (tb->R[h], tb->S[h], LAST_TO_FIRST, pointer_amount, 1);*/ +} + + +/* Delete insert_num node pointers together with their left items + * and balance current node.*/ +static void balance_internal_when_delete (struct tree_balance * tb, + int h, int child_pos) +{ + int insert_num; + int n; + struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); + struct buffer_info bi; + + insert_num = tb->insert_size[h] / ((int)(DC_SIZE + KEY_SIZE)); + + /* delete child-node-pointer(s) together with their left item(s) */ + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + + internal_delete_childs (&bi, child_pos, -insert_num); + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->blknum[h] > 1 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "tb->blknum[%d]=%d when insert_size < 0", + h, tb->blknum[h]); +#endif /* CONFIG_REISERFS_CHECK */ + + n = B_NR_ITEMS(tbSh); + + if ( tb->lnum[h] == 0 && tb->rnum[h] == 0 ) { + if ( tb->blknum[h] == 0 ) { + /* node S[h] (root of the tree) is empty now */ + struct buffer_head *new_root; + +#ifdef CONFIG_REISERFS_CHECK + if (n || B_FREE_SPACE (tbSh) != MAX_CHILD_SIZE(tbSh) - DC_SIZE) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "buffer must have only 0 keys (%d)", + n); + + if (bi.bi_parent) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", 
"root has parent (%p)", bi.bi_parent); +#endif /* CONFIG_REISERFS_CHECK */ + + /* choose a new root */ + if ( ! tb->L[h-1] || ! B_NR_ITEMS(tb->L[h-1]) ) + new_root = tb->R[h-1]; + else + new_root = tb->L[h-1]; + /* switch super block's tree root block number to the new value */ + tb->tb_sb->u.reiserfs_sb.s_rs->s_root_block = cpu_to_le32 (new_root->b_blocknr); + //tb->tb_sb->u.reiserfs_sb.s_rs->s_tree_height --; + tb->tb_sb->u.reiserfs_sb.s_rs->s_tree_height = cpu_to_le16 (SB_TREE_HEIGHT (tb->tb_sb) - 1); + + do_balance_mark_sb_dirty (tb, tb->tb_sb->u.reiserfs_sb.s_sbh, 1); + /*&&&&&&&&&&&&&&&&&&&&&&*/ + if (h > 1) + /* use check_internal if new root is an internal node */ + check_internal (new_root); + /*&&&&&&&&&&&&&&&&&&&&&&*/ + tb->tb_sb->s_dirt = 1; + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb, tbSh); + return; + } + return; + } + + if ( tb->L[h] && tb->lnum[h] == -B_NR_ITEMS(tb->L[h]) - 1 ) { /* join S[h] with L[h] */ + +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rnum[h] != 0 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "invalid tb->rnum[%d]==%d when joining S[h] with L[h]", + h, tb->rnum[h]); +#endif /* CONFIG_REISERFS_CHECK */ + + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, n + 1); + reiserfs_invalidate_buffer(tb, tbSh); + + return; + } + + if ( tb->R[h] && tb->rnum[h] == -B_NR_ITEMS(tb->R[h]) - 1 ) { /* join S[h] with R[h] */ +#ifdef CONFIG_REISERFS_CHECK + if ( tb->lnum[h] != 0 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "invalid tb->lnum[%d]==%d when joining S[h] with R[h]", + h, tb->lnum[h]); +#endif /* CONFIG_REISERFS_CHECK */ + + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, n + 1); + + reiserfs_invalidate_buffer(tb,tbSh); + return; + } + + if ( tb->lnum[h] < 0 ) { /* borrow from left neighbor L[h] */ +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rnum[h] != 0 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "invalid tb->rnum[%d]==%d when borrow from L[h]", + h, tb->rnum[h]); +#endif /* CONFIG_REISERFS_CHECK */ + /*internal_shift_right (tb, h, tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], -tb->lnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_L_TO_S, tb, h, -tb->lnum[h]); + return; + } + + if ( tb->rnum[h] < 0 ) { /* borrow from right neighbor R[h] */ +#ifdef CONFIG_REISERFS_CHECK + if ( tb->lnum[h] != 0 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "invalid tb->lnum[%d]==%d when borrow from R[h]", + h, tb->lnum[h]); +#endif /* CONFIG_REISERFS_CHECK */ + internal_shift_left (INTERNAL_SHIFT_FROM_R_TO_S, tb, h, -tb->rnum[h]);/*tb->S[h], tb->CFR[h], tb->rkey[h], tb->R[h], -tb->rnum[h]);*/ + return; + } + + if ( tb->lnum[h] > 0 ) { /* split S[h] into two parts and put them into neighbors */ +#ifdef CONFIG_REISERFS_CHECK + if ( tb->rnum[h] == 0 || tb->lnum[h] + tb->rnum[h] != n + 1 ) + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", + "invalid tb->lnum[%d]==%d or tb->rnum[%d]==%d when S[h](item number == %d) is split between them", + h, tb->lnum[h], h, tb->rnum[h], n); +#endif /* CONFIG_REISERFS_CHECK */ + + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]);/*tb->L[h], tb->CFL[h], tb->lkey[h], tb->S[h], tb->lnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); + + reiserfs_invalidate_buffer (tb, tbSh); + + return; + } + reiserfs_panic (tb->tb_sb, "balance_internal_when_delete", "unexpected tb->lnum[%d]==%d or tb->rnum[%d]==%d", + h, tb->lnum[h], h, tb->rnum[h]); +} + + +/* Replace delimiting 
key of buffers L[h] and S[h] by the given key.*/ +void replace_lkey ( + struct tree_balance * tb, + int h, + struct item_head * key + ) +{ +#ifdef CONFIG_REISERFS_CHECK + if (tb->L[h] == NULL || tb->CFL[h] == NULL) + reiserfs_panic (tb->tb_sb, "replace_lkey: 12255: " + "L[h](%p) and CFL[h](%p) must exist in replace_lkey", tb->L[h], tb->CFL[h]); +#endif + + if (B_NR_ITEMS(PATH_H_PBUFFER(tb->tb_path, h)) == 0) + return; + + memcpy (B_N_PDELIM_KEY(tb->CFL[h],tb->lkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty (tb, tb->CFL[h],0); +} + + +/* Replace delimiting key of buffers S[h] and R[h] by the given key.*/ +void replace_rkey ( + struct tree_balance * tb, + int h, + struct item_head * key + ) +{ +#ifdef CONFIG_REISERFS_CHECK + if (tb->R[h] == NULL || tb->CFR[h] == NULL) + reiserfs_panic (tb->tb_sb, "replace_rkey: 12260: " + "R[h](%p) and CFR[h](%p) must exist in replace_rkey", tb->R[h], tb->CFR[h]); + + if (B_NR_ITEMS(tb->R[h]) == 0) + reiserfs_panic (tb->tb_sb, "replace_rkey: 12265: " + "R[h] can not be empty if it exists (item number=%d)", B_NR_ITEMS(tb->R[h])); +#endif + + memcpy (B_N_PDELIM_KEY(tb->CFR[h],tb->rkey[h]), key, KEY_SIZE); + + do_balance_mark_internal_dirty (tb, tb->CFR[h], 0); +} + + +int balance_internal (struct tree_balance * tb, /* tree_balance structure */ + int h, /* level of the tree */ + int child_pos, + struct item_head * insert_key, /* key for insertion on higher level */ + struct buffer_head ** insert_ptr /* node for insertion on higher level*/ + ) + /* if inserting/pasting + { + child_pos is the position of the node-pointer in S[h] that * + pointed to S[h-1] before balancing of the h-1 level; * + this means that new pointers and items must be inserted AFTER * + child_pos + } + else + { + it is the position of the leftmost pointer that must be deleted (together with + its corresponding key to the left of the pointer) + as a result of the previous level's balancing. + } +*/ +{ + struct buffer_head * tbSh = PATH_H_PBUFFER (tb->tb_path, h); + struct buffer_info bi; + int order; /* we return this: it is 0 if there is no S[h], else it is tb->S[h]->b_item_order */ + int insert_num, n, k; + struct buffer_head * S_new; + struct item_head new_insert_key; + struct buffer_head * new_insert_ptr = NULL; + struct item_head * new_insert_key_addr = insert_key; + +#ifdef CONFIG_REISERFS_CHECK + if ( h < 1 ) + reiserfs_panic (tb->tb_sb, "balance_internal", "h (%d) can not be < 1 on internal level", h); +#endif /* CONFIG_REISERFS_CHECK */ + + order = ( tbSh ) ? PATH_H_POSITION (tb->tb_path, h + 1)/*tb->S[h]->b_item_order*/ : 0; + + /* Using insert_size[h] calculate the number insert_num of items + that must be inserted to or deleted from S[h]. */ + insert_num = tb->insert_size[h]/((int)(KEY_SIZE + DC_SIZE)); + + /* Check whether insert_num is proper **/ +#ifdef CONFIG_REISERFS_CHECK + if ( insert_num < -2 || insert_num > 2 ) + reiserfs_panic (tb->tb_sb, "balance_internal", + "incorrect number of items inserted to the internal node (%d)", insert_num); + + if ( h > 1 && (insert_num > 1 || insert_num < -1) ) + reiserfs_panic (tb->tb_sb, "balance_internal", + "incorrect number of items (%d) inserted to the internal node on a level (h=%d) higher than last internal level", + insert_num, h); +#endif /* CONFIG_REISERFS_CHECK */ + + /* Make balance in case insert_num < 0 */ + if ( insert_num < 0 ) { + balance_internal_when_delete (tb, h, child_pos); + return order; + } + + k = 0; + if ( tb->lnum[h] > 0 ) { + /* shift lnum[h] items from S[h] to the left neighbor L[h]. 
+ check how many of new items fall into L[h] or CFL[h] after + shifting */ + n = B_NR_ITEMS (tb->L[h]); /* number of items in L[h] */ + if ( tb->lnum[h] <= child_pos ) { + /* new items don't fall into L[h] or CFL[h] */ + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h]); + /*internal_shift_left (tb->L[h],tb->CFL[h],tb->lkey[h],tbSh,tb->lnum[h]);*/ + child_pos -= tb->lnum[h]; + } else if ( tb->lnum[h] > child_pos + insert_num ) { + /* all new items fall into L[h] */ + internal_shift_left (INTERNAL_SHIFT_FROM_S_TO_L, tb, h, tb->lnum[h] - insert_num); + /* internal_shift_left(tb->L[h],tb->CFL[h],tb->lkey[h],tbSh, + tb->lnum[h]-insert_num); + */ + /* insert insert_num keys and node-pointers into L[h] */ + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position (tb, h); + internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next*/ n + child_pos + 1, + insert_num,insert_key,insert_ptr); + + insert_num = 0; + } else { + struct disk_child * dc; + + /* some items fall into L[h] or CFL[h], but some don't fall */ + internal_shift1_left(tb,h,child_pos+1); + /* calculate number of new items that fall into L[h] */ + k = tb->lnum[h] - child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->L[h]; + bi.bi_parent = tb->FL[h]; + bi.bi_position = get_left_neighbor_position (tb, h); + internal_insert_childs (&bi,/*tb->L[h], tb->S[h-1]->b_next,*/ n + child_pos + 1,k, + insert_key,insert_ptr); + + replace_lkey(tb,h,insert_key + k); + + /* replace the first node-ptr in S[h] by node-ptr to insert_ptr[k] */ + dc = B_N_CHILD(tbSh, 0); + dc->dc_size = cpu_to_le16 (MAX_CHILD_SIZE(insert_ptr[k]) - B_FREE_SPACE (insert_ptr[k])); + dc->dc_block_number = cpu_to_le32 (insert_ptr[k]->b_blocknr); + + do_balance_mark_internal_dirty (tb, tbSh, 0); + + k++; + insert_key += k; + insert_ptr += k; + insert_num -= k; + child_pos = 0; + } + } /* tb->lnum[h] > 0 */ + + if ( tb->rnum[h] > 0 ) { + /*shift rnum[h] items from S[h] to the right neighbor R[h]*/ + /* check how many of new items fall into R or CFR after shifting */ + n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ + if ( n - tb->rnum[h] >= child_pos ) + /* new items fall into S[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h],tb->rnum[h]);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h]); + else + if ( n + insert_num - tb->rnum[h] < child_pos ) + { + /* all new items fall into R[h] */ + /*internal_shift_right(tb,h,tbSh,tb->CFR[h],tb->rkey[h],tb->R[h], + tb->rnum[h] - insert_num);*/ + internal_shift_right (INTERNAL_SHIFT_FROM_S_TO_R, tb, h, tb->rnum[h] - insert_num); + + /* insert insert_num keys and node-pointers into R[h] */ + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position (tb, h); + internal_insert_childs (&bi, /*tb->R[h],tb->S[h-1]->b_next*/ child_pos - n - insert_num + tb->rnum[h] - 1, + insert_num,insert_key,insert_ptr); + insert_num = 0; + } + else + { + struct disk_child * dc; + + /* one of the items falls into CFR[h] */ + internal_shift1_right(tb,h,n - child_pos + 1); + /* calculate number of new items that fall into R[h] */ + k = tb->rnum[h] - n + child_pos - 1; + bi.tb = tb; + bi.bi_bh = tb->R[h]; + bi.bi_parent = tb->FR[h]; + bi.bi_position = get_right_neighbor_position (tb, h); + internal_insert_childs (&bi, /*tb->R[h], tb->R[h]->b_child,*/ 0, k, insert_key + 1, insert_ptr + 1); + + replace_rkey(tb,h,insert_key + insert_num - k - 1); + + /* replace the first node-ptr in R[h] by node-ptr 
insert_ptr[insert_num-k-1]*/ + dc = B_N_CHILD(tb->R[h], 0); + dc->dc_size = + cpu_to_le16 (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - + B_FREE_SPACE (insert_ptr[insert_num-k-1])); + dc->dc_block_number = cpu_to_le32 (insert_ptr[insert_num-k-1]->b_blocknr); + + do_balance_mark_internal_dirty (tb, tb->R[h],0); + + insert_num -= (k + 1); + } + } + + /** Fill new node that appears instead of S[h] **/ +#ifdef CONFIG_REISERFS_CHECK + if ( tb->blknum[h] > 2 ) + reiserfs_panic(0, "balance_internal", "blknum can not be > 2 for internal level"); + if ( tb->blknum[h] < 0 ) + reiserfs_panic(0, "balance_internal", "blknum can not be < 0"); +#endif /* CONFIG_REISERFS_CHECK */ + + if ( ! tb->blknum[h] ) + { /* node S[h] is empty now */ +#ifdef CONFIG_REISERFS_CHECK + if ( ! tbSh ) + reiserfs_panic(0,"balance_internal", "S[h] is equal NULL"); +#endif /* CONFIG_REISERFS_CHECK */ + + /* do what is needed for buffer thrown from tree */ + reiserfs_invalidate_buffer(tb,tbSh); + return order; + } + + if ( ! tbSh ) { + /* create new root */ + struct disk_child * dc; + struct buffer_head * tbSh_1 = PATH_H_PBUFFER (tb->tb_path, h - 1); + + + if ( tb->blknum[h] != 1 ) + reiserfs_panic(0, "balance_internal", "One new node required for creating the new root"); + /* S[h] = empty buffer from the list FEB. */ + tbSh = get_FEB (tb); + B_BLK_HEAD(tbSh)->blk_level = cpu_to_le16 (h + 1); + + /* Put the unique node-pointer to S[h] that points to S[h-1]. */ + + dc = B_N_CHILD(tbSh, 0); + dc->dc_block_number = cpu_to_le32 (tbSh_1->b_blocknr); + dc->dc_size = cpu_to_le16 (MAX_CHILD_SIZE (tbSh_1) - B_FREE_SPACE (tbSh_1)); + + tb->insert_size[h] -= DC_SIZE; + B_BLK_HEAD(tbSh)->blk_free_space = cpu_to_le16 (B_FREE_SPACE (tbSh) - DC_SIZE); + + do_balance_mark_internal_dirty (tb, tbSh, 0); + + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + check_internal (tbSh); + /*&&&&&&&&&&&&&&&&&&&&&&&&*/ + + /* put new root into path structure */ + PATH_OFFSET_PBUFFER(tb->tb_path, ILLEGAL_PATH_ELEMENT_OFFSET) = tbSh; + + /* Change root in structure super block. 
*/ + tb->tb_sb->u.reiserfs_sb.s_rs->s_root_block = cpu_to_le32 (tbSh->b_blocknr); + tb->tb_sb->u.reiserfs_sb.s_rs->s_tree_height = cpu_to_le16 (SB_TREE_HEIGHT (tb->tb_sb) + 1); + do_balance_mark_sb_dirty (tb, tb->tb_sb->u.reiserfs_sb.s_sbh, 1); + tb->tb_sb->s_dirt = 1; + } + + if ( tb->blknum[h] == 2 ) { + int snum; + struct buffer_info dest_bi, src_bi; + + + /* S_new = free buffer from list FEB */ + S_new = get_FEB(tb); + + B_BLK_HEAD(S_new)->blk_level = cpu_to_le16 (h + 1); + + dest_bi.tb = tb; + dest_bi.bi_bh = S_new; + dest_bi.bi_parent = 0; + dest_bi.bi_position = 0; + src_bi.tb = tb; + src_bi.bi_bh = tbSh; + src_bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + src_bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); + + n = B_NR_ITEMS (tbSh); /* number of items in S[h] */ + snum = (insert_num + n + 1)/2; + if ( n - snum >= child_pos ) { + /* new items don't fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n - snum)'th key in S[h] */ + memcpy (&new_insert_key,B_N_PDELIM_KEY(tbSh,n - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum, 0); + /* internal_move_pointers_items(S_new, tbSh, LAST_TO_FIRST, snum, 0);*/ + } else if ( n + insert_num - snum < child_pos ) { + /* all new items fall into S_new */ + /* store the delimiting key for the next level */ + /* new_insert_key = (n + insert_item - snum)'th key in S[h] */ + memcpy(&new_insert_key,B_N_PDELIM_KEY(tbSh,n + insert_num - snum), + KEY_SIZE); + /* last parameter is del_par */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, snum - insert_num, 0); + /* internal_move_pointers_items(S_new,tbSh,1,snum - insert_num,0);*/ + + /* insert insert_num keys and node-pointers into S_new */ + internal_insert_childs (&dest_bi, /*S_new,tb->S[h-1]->b_next,*/child_pos - n - insert_num + snum - 1, + insert_num,insert_key,insert_ptr); + + insert_num = 0; + } else { + struct disk_child * dc; + + /* some items fall into S_new, but some don't fall */ + /* last parameter is del_par */ + internal_move_pointers_items (&dest_bi, &src_bi, LAST_TO_FIRST, n - child_pos + 1, 1); + /* internal_move_pointers_items(S_new,tbSh,1,n - child_pos + 1,1);*/ + /* calculate number of new items that fall into S_new */ + k = snum - n + child_pos - 1; + + internal_insert_childs (&dest_bi, /*S_new,*/ 0, k, insert_key + 1, insert_ptr+1); + + /* new_insert_key = insert_key[insert_num - k - 1] */ + memcpy(&new_insert_key,insert_key + insert_num - k - 1, + KEY_SIZE); + /* replace first node-ptr in S_new by node-ptr to insert_ptr[insert_num-k-1] */ + + dc = B_N_CHILD(S_new,0); + dc->dc_size = cpu_to_le16 (MAX_CHILD_SIZE(insert_ptr[insert_num-k-1]) - + B_FREE_SPACE(insert_ptr[insert_num-k-1])); + dc->dc_block_number = cpu_to_le32 (insert_ptr[insert_num-k-1]->b_blocknr); + + do_balance_mark_internal_dirty (tb, S_new,0); + + insert_num -= (k + 1); + } + /* new_insert_ptr = node_pointer to S_new */ + new_insert_ptr = S_new; + +#ifdef CONFIG_REISERFS_CHECK + if ( buffer_locked(S_new) || atomic_read (&(S_new->b_count)) != 1) + if (buffer_locked(S_new) || atomic_read(&(S_new->b_count)) > 2 || + !(buffer_journaled(S_new) || buffer_journal_dirty(S_new))) { + reiserfs_panic (tb->tb_sb, "cm-00001: balance_internal: bad S_new (%b)", S_new); + } +#endif /* CONFIG_REISERFS_CHECK */ + + // S_new is released in unfix_nodes + } + + n = B_NR_ITEMS (tbSh); /*number of items in S[h] */ + +#ifdef REISERFS_FSCK + if ( -1 <= child_pos && child_pos <= n && insert_num > 0 
) { +#else + if ( 0 <= child_pos && child_pos <= n && insert_num > 0 ) { +#endif + bi.tb = tb; + bi.bi_bh = tbSh; + bi.bi_parent = PATH_H_PPARENT (tb->tb_path, h); + bi.bi_position = PATH_H_POSITION (tb->tb_path, h + 1); +#ifdef REISERFS_FSCK + if (child_pos == -1) { + /* this is a little different from original do_balance: + here we insert the minimal keys in the tree, that has never happened when file system works */ + if (tb->CFL[h-1] || insert_num != 1 || h != 1) + die ("balance_internal: invalid child_pos"); +/* insert_child (tb->S[h], tb->S[h-1], child_pos, insert_num, B_N_ITEM_HEAD(tb->S[0],0), insert_ptr);*/ + internal_insert_childs (&bi, child_pos, insert_num, B_N_PITEM_HEAD (PATH_PLAST_BUFFER (tb->tb_path), 0), insert_ptr); + } else +#endif + internal_insert_childs ( + &bi,/*tbSh,*/ + /* ( tb->S[h-1]->b_parent == tb->S[h] ) ? tb->S[h-1]->b_next : tb->S[h]->b_child->b_next,*/ + child_pos,insert_num,insert_key,insert_ptr + ); + } + + + memcpy (new_insert_key_addr,&new_insert_key,KEY_SIZE); + insert_ptr[0] = new_insert_ptr; + + return order; + } + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/inode.c linux/fs/reiserfs/inode.c --- v2.4.0/linux/fs/reiserfs/inode.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/inode.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,1879 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +/* args for the create parameter of reiserfs_get_block */ +#define GET_BLOCK_NO_CREATE 0 /* don't create new blocks or convert tails */ +#define GET_BLOCK_CREATE 1 /* add anything you need to find block */ +#define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ +#define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// +void reiserfs_delete_inode (struct inode * inode) +{ + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; + int windex ; + struct reiserfs_transaction_handle th ; + + + lock_kernel() ; + + /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. 
*/ + if (INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ + down (&inode->i_sem); + + journal_begin(&th, inode->i_sb, jbegin_count) ; + windex = push_journal_writer("delete_inode") ; + + reiserfs_delete_object (&th, inode); + reiserfs_remove_page_from_flush_list(&th, inode) ; + pop_journal_writer(windex) ; + reiserfs_release_objectid (&th, inode->i_ino); + + journal_end(&th, inode->i_sb, jbegin_count) ; + + up (&inode->i_sem); + } else { + /* no object items are in the tree */ + ; + } + clear_inode (inode); /* note this must go after the journal_end to prevent deadlock */ + unlock_kernel() ; +} + +static void _make_cpu_key (struct cpu_key * key, int version, __u32 dirid, __u32 objectid, + loff_t offset, int type, int length) +{ + key->version = version; + + key->on_disk_key.k_dir_id = dirid; + key->on_disk_key.k_objectid = objectid; + set_cpu_key_k_offset (key, offset); + set_cpu_key_k_type (key, type); + key->key_length = length; +} + + +/* take base of inode_key (it comes from inode always) (dirid, objectid) and version from an inode, set + offset and type of key */ +void make_cpu_key (struct cpu_key * key, const struct inode * inode, loff_t offset, + int type, int length) +{ + _make_cpu_key (key, inode_items_version (inode), le32_to_cpu (INODE_PKEY (inode)->k_dir_id), + le32_to_cpu (INODE_PKEY (inode)->k_objectid), + offset, type, length); +} + + +// +// when key is 0, do not set version and short key +// +inline void make_le_item_head (struct item_head * ih, struct cpu_key * key, int version, + loff_t offset, int type, int length, int entry_count/*or ih_free_space*/) +{ + if (key) { + ih->ih_key.k_dir_id = cpu_to_le32 (key->on_disk_key.k_dir_id); + ih->ih_key.k_objectid = cpu_to_le32 (key->on_disk_key.k_objectid); + } + ih->ih_version = cpu_to_le16 (version); + set_le_ih_k_offset (ih, offset); + set_le_ih_k_type (ih, type); + ih->ih_item_len = cpu_to_le16 (length); + /* set_ih_free_space (ih, 0);*/ + // for directory items it is entry count, for directs and stat + // datas - 0xffff, for indirects - 0 + ih->u.ih_entry_count = cpu_to_le16 (entry_count); +} + + +// +// FIXME: we might cache recently accessed indirect item (or at least +// first 15 pointers just like ext2 does + +// Ugh. Not too eager for that.... +// I cut the code until such time as I see a convincing argument (benchmark). +// I don't want a bloated inode struct..., and I don't like code complexity.... + +/* cutting the code is fine, since it really isn't in use yet and is easy +** to add back in. But, Vladimir has a really good idea here. Think +** about what happens for reading a file. For each page, +** The VFS layer calls reiserfs_readpage, who searches the tree to find +** an indirect item. This indirect item has X number of pointers, where +** X is a big number if we've done the block allocation right. But, +** we only use one or two of these pointers during each call to readpage, +** needlessly researching again later on. +** +** The size of the cache could be dynamic based on the size of the file. +** +** I'd also like to see us cache the location the stat data item, since +** we are needlessly researching for that frequently. 
+** +** --chris +*/ + +/* people who call journal_begin with a page locked must call this +** BEFORE calling journal_begin +*/ +static int prevent_flush_page_lock(struct page *page, + struct inode *inode) { + struct reiserfs_page_list *pl ; + struct super_block *s = inode->i_sb ; + /* we don't care if the inode has a stale pointer from an old + ** transaction + */ + if(!page || inode->u.reiserfs_i.i_conversion_trans_id != SB_JOURNAL(s)->j_trans_id) { + return 0 ; + } + pl = inode->u.reiserfs_i.i_converted_page ; + if (pl && pl->page == page) { + pl->do_not_lock = 1 ; + } + /* this last part is really important. The address space operations have + ** the page locked before they call the journal functions. So it is possible + ** for one process to be waiting in flush_pages_before_commit for a + ** page, then for the process with the page locked to call journal_begin. + ** + ** We'll deadlock because the process flushing pages will never notice + ** the process with the page locked has called prevent_flush_page_lock. + ** So, we wake up the page waiters, even though the page is still locked. + ** The process waiting in flush_pages_before_commit must check the + ** pl->do_not_lock flag, and stop trying to lock the page. + */ + wake_up(&page->wait) ; + return 0 ; + +} +/* people who call journal_end with a page locked must call this +** AFTER calling journal_end +*/ +static int allow_flush_page_lock(struct page *page, + struct inode *inode) { + + struct reiserfs_page_list *pl ; + struct super_block *s = inode->i_sb ; + /* we don't care if the inode has a stale pointer from an old + ** transaction + */ + if(!page || inode->u.reiserfs_i.i_conversion_trans_id != SB_JOURNAL(s)->j_trans_id) { + return 0 ; + } + pl = inode->u.reiserfs_i.i_converted_page ; + if (pl && pl->page == page) { + pl->do_not_lock = 0 ; + } + return 0 ; + +} + +/* If this page has a file tail in it, and +** it was read in by get_block_create_0, the page data is valid, +** but tail is still sitting in a direct item, and we can't write to +** it. So, look through this page, and check all the mapped buffers +** to make sure they have valid block numbers. Any that don't need +** to be unmapped, so that block_prepare_write will correctly call +** reiserfs_get_block to convert the tail into an unformatted node +*/ +static inline void fix_tail_page_for_writing(struct page *page) { + struct buffer_head *head, *next, *bh ; + + if (page && page->buffers) { + head = page->buffers ; + bh = head ; + do { + next = bh->b_this_page ; + if (buffer_mapped(bh) && bh->b_blocknr == 0) { + reiserfs_unmap_buffer(bh) ; + } + bh = next ; + } while (bh != head) ; + } +} + + + + +/* we need to allocate a block for new unformatted node. Try to figure out + what point in bitmap reiserfs_new_blocknrs should start from. 
*/
+static b_blocknr_t find_tag (struct buffer_head * bh, struct item_head * ih,
+                             __u32 * item, int pos_in_item)
+{
+    if (!is_indirect_le_ih (ih))
+        /* something more complicated could be here */
+        return bh->b_blocknr;
+
+    /* for indirect item: go to the left and look for the first non-hole entry
+       in the indirect item */
+    if (pos_in_item == I_UNFM_NUM (ih))
+        pos_in_item --;
+    while (pos_in_item >= 0) {
+        if (item [pos_in_item])
+            return item [pos_in_item];
+        pos_in_item --;
+    }
+    return bh->b_blocknr;
+}
+
+
+/* reiserfs_get_block does not need to allocate a block only if the
+   allocation has been done already or a non-hole position has been found
+   in the indirect item */
+static inline int allocation_needed (int retval, b_blocknr_t allocated,
+                                     struct item_head * ih,
+                                     __u32 * item, int pos_in_item)
+{
+    if (allocated)
+        return 0;
+    if (retval == POSITION_FOUND && is_indirect_le_ih (ih) && item[pos_in_item])
+        return 0;
+    return 1;
+}
+
+static inline int indirect_item_found (int retval, struct item_head * ih)
+{
+    return (retval == POSITION_FOUND) && is_indirect_le_ih (ih);
+}
+
+
+static inline void set_block_dev_mapped (struct buffer_head * bh,
+                                         b_blocknr_t block, struct inode * inode)
+{
+    bh->b_dev = inode->i_dev;
+    bh->b_blocknr = block;
+    bh->b_state |= (1UL << BH_Mapped);
+}
+
+
+//
+// files created by the old (version 1) format cannot be larger than 2 GB
+//
+int file_capable (struct inode * inode, long block)
+{
+    if (inode_items_version (inode) != ITEM_VERSION_1 || // it is a new-format file
+        block < (1 << (31 - inode->i_sb->s_blocksize_bits))) // old-format file, but 'block' is within 2 GB
+        return 1;
+
+    return 0;
+}
+
+/*static*/ void restart_transaction(struct reiserfs_transaction_handle *th,
+                                struct inode *inode, struct path *path) {
+    struct super_block *s = th->t_super ;
+    int len = th->t_blocks_allocated ;
+
+    pathrelse(path) ;
+    reiserfs_update_sd(th, inode) ;
+    journal_end(th, s, len) ;
+    journal_begin(th, s, len) ;
+}
+
+// _get_block_create_0 is called by reiserfs_get_block when create == 0.
+// It maps the 'block'-th logical block of the file into bh_result.  If the
+// block is stored in an unformatted node, that node's block number is placed
+// in bh_result.  If the block is stored in a direct item (a file tail), there
+// is no disk block to map: when called from bmap the function returns
+// -ENOENT, otherwise it copies the tail bytes from the direct item(s) into
+// the page behind bh_result and leaves b_blocknr == 0.
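/*
 * A stand-alone sketch of the offset arithmetic that _get_block_create_0 and
 * reiserfs_get_block rely on.  The 4096-byte block size below is an
 * assumption made for this example only; it is not taken from the patch.
 */
#include <stdio.h>

#define EX_BLOCKSIZE       4096L
#define EX_BLOCKSIZE_BITS  12

int main(void)
{
    long block = 3;             /* some logical block of a file            */
    long long byte = 10000;     /* some byte offset, e.g. a byte in a tail */

    /* item keys use 1-based byte offsets, so the key for the 'block'-th
       logical block of a file points at byte block * blocksize + 1 */
    long long key_offset = (long long) block * EX_BLOCKSIZE + 1;

    /* start of the block that contains 'byte', as computed when a direct
       item (tail) is converted: ((offset - 1) & ~(blocksize - 1)) + 1 */
    long long tail_offset = ((byte - 1) & ~(EX_BLOCKSIZE - 1)) + 1;

    /* old-format (ITEM_VERSION_1) files are limited to 2 GB, which is what
       file_capable() checks: block < 1 << (31 - blocksize_bits) */
    long max_old_block = 1L << (31 - EX_BLOCKSIZE_BITS);

    printf("key offset of block %ld: %lld\n", block, key_offset);
    printf("block-aligned offset of byte %lld: %lld\n", byte, tail_offset);
    printf("last block addressable by an old-format file: %ld\n",
           max_old_block - 1);
    return 0;
}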
+ +static int _get_block_create_0 (struct inode * inode, long block, + struct buffer_head * bh_result, + int args) +{ + INITIALIZE_PATH (path); + struct cpu_key key; + struct buffer_head * bh; + struct item_head * ih, tmp_ih; + int fs_gen ; + int blocknr; + char * p = NULL; + int chars; + int ret ; + unsigned long offset ; + + // prepare the key to look for the 'block'-th block of file + make_cpu_key (&key, inode, + (loff_t)block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 3); + +research: + if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) { + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + if ((args & GET_BLOCK_NO_HOLE)) { + return -ENOENT ; + } + return 0 ; + } + + // + bh = get_bh (&path); + ih = get_ih (&path); + if (is_indirect_le_ih (ih)) { + __u32 * ind_item = (__u32 *)B_I_PITEM (bh, ih); + + /* FIXME: here we could cache indirect item or part of it in + the inode to avoid search_by_key in case of subsequent + access to file */ + blocknr = le32_to_cpu (ind_item [path.pos_in_item]); + ret = 0 ; + if (blocknr) { + bh_result->b_dev = inode->i_dev; + bh_result->b_blocknr = blocknr; + bh_result->b_state |= (1UL << BH_Mapped); + } else if ((args & GET_BLOCK_NO_HOLE)) { + ret = -ENOENT ; + } + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + return ret ; + } + + // requested data are in direct item(s) + if (!(args & GET_BLOCK_READ_DIRECT)) { + // we are called by bmap. FIXME: we can not map block of file + // when it is stored in direct item(s) + pathrelse (&path); + if (p) + kunmap(bh_result->b_page) ; + return -ENOENT; + } + + // read file tail into part of page + offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1) ; + fs_gen = get_generation(inode->i_sb) ; + copy_item_head (&tmp_ih, ih); + + /* we only want to kmap if we are reading the tail into the page. + ** this is not the common case, so we don't kmap until we are + ** sure we need to. But, this means the item might move if + ** kmap schedules + */ + p = (char *)kmap(bh_result->b_page) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + goto research; + } + p += offset ; + memset (p, 0, inode->i_sb->s_blocksize); + do { + if (!is_direct_le_ih (ih)) { + BUG (); + } + chars = le16_to_cpu (ih->ih_item_len) - path.pos_in_item; + memcpy (p, B_I_PITEM (bh, ih) + path.pos_in_item, chars); + p += chars; + + if (PATH_LAST_POSITION (&path) != (B_NR_ITEMS (bh) - 1)) + // we done, if read direct item is not the last item of + // node FIXME: we could try to check right delimiting key + // to see whether direct item continues in the right + // neighbor or rely on i_size + break; + + // update key to look for the next piece + set_cpu_key_k_offset (&key, cpu_key_k_offset (&key) + chars); + if (search_for_position_by_key (inode->i_sb, &key, &path) != POSITION_FOUND) + // we read something from tail, even if now we got IO_ERROR + break; + bh = get_bh (&path); + ih = get_ih (&path); + } while (1); + + pathrelse (&path); + + // FIXME: b_blocknr == 0 here. but b_data contains correct data + // from tail. ll_rw_block will skip uptodate buffers + bh_result->b_blocknr = 0 ; + bh_result->b_dev = inode->i_dev; + mark_buffer_uptodate (bh_result, 1); + bh_result->b_state |= (1UL << BH_Mapped); + kunmap(bh_result->b_page) ; + + return 0; +} + + +// this is called to create file map. 
So, _get_block_create_0 will not +// read direct item +int reiserfs_bmap (struct inode * inode, long block, + struct buffer_head * bh_result, int create) +{ + if (!file_capable (inode, block)) + return -EFBIG; + + lock_kernel() ; + /* do not read the direct item */ + _get_block_create_0 (inode, block, bh_result, 0) ; + unlock_kernel() ; + return 0; +} + +/* special version of get_block that is only used by grab_tail_page right +** now. It is sent to block_prepare_write, and when you try to get a +** block past the end of the file (or a block from a hole) it returns +** -ENOENT instead of a valid buffer. block_prepare_write expects to +** be able to do i/o on the buffers returned, unless an error value +** is also returned. +** +** So, this allows block_prepare_write to be used for reading a single block +** in a page. Where it does not produce a valid page for holes, or past the +** end of the file. This turns out to be exactly what we need for reading +** tails for conversion. +** +** The point of the wrapper is forcing a certain value for create, even +** though the VFS layer is calling this function with create==1. If you +** don't want to send create == GET_BLOCK_NO_HOLE to reiserfs_get_block, +** don't use this function. +*/ +static int reiserfs_get_block_create_0 (struct inode * inode, long block, + struct buffer_head * bh_result, int create) { + return reiserfs_get_block(inode, block, bh_result, GET_BLOCK_NO_HOLE) ; +} + +/* +** helper function for when reiserfs_get_block is called for a hole +** but the file tail is still in a direct item +** bh_result is the buffer head for the hole +** tail_offset is the offset of the start of the tail in the file +** +** This calls prepare_write, which will start a new transaction +** you should not be in a transaction, or have any paths held when you +** call this. +*/ +static int convert_tail_for_hole(struct inode *inode, + struct buffer_head *bh_result, + loff_t tail_offset) { + unsigned long index ; + unsigned long tail_end ; + unsigned long tail_start ; + struct page * tail_page ; + struct page * hole_page = bh_result->b_page ; + int retval = 0 ; + + if ((tail_offset & (bh_result->b_size - 1)) != 1) + return -EIO ; + + /* always try to read until the end of the block */ + tail_start = tail_offset & (PAGE_CACHE_SIZE - 1) ; + tail_end = (tail_start | (bh_result->b_size - 1)) + 1 ; + + index = tail_offset >> PAGE_CACHE_SHIFT ; + if (index != hole_page->index) { + tail_page = grab_cache_page(inode->i_mapping, index) ; + retval = PTR_ERR(tail_page) ; + if (IS_ERR(tail_page)) { + goto out ; + } + } else { + tail_page = hole_page ; + } + + /* we don't have to make sure the conversion did not happen while + ** we were locking the page because anyone that could convert + ** must first take i_sem. + ** + ** We must fix the tail page for writing because it might have buffers + ** that are mapped, but have a block number of 0. This indicates tail + ** data that has been read directly into the page, and block_prepare_write + ** won't trigger a get_block in this case. 
+ */ + fix_tail_page_for_writing(tail_page) ; + retval = block_prepare_write(tail_page, tail_start, tail_end, + reiserfs_get_block) ; + if (retval) + goto unlock ; + + /* tail conversion might change the data in the page */ + flush_dcache_page(tail_page) ; + + retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ; + +unlock: + if (tail_page != hole_page) { + UnlockPage(tail_page) ; + page_cache_release(tail_page) ; + } +out: + return retval ; +} + +// +// initially this function was derived from ext2's analog and evolved +// as the prototype did. You'll need to look at the ext2 version to +// determine which parts are derivative, if any, understanding that +// there are only so many ways to code to a given interface. +// +int reiserfs_get_block (struct inode * inode, long block, + struct buffer_head * bh_result, int create) +{ + int repeat, retval; + unsigned long tag; + b_blocknr_t allocated_block_nr = 0;// b_blocknr_t is unsigned long + INITIALIZE_PATH(path); + int pos_in_item; + struct cpu_key key; + struct buffer_head * bh, * unbh = 0; + struct item_head * ih, tmp_ih; + __u32 * item; + int done; + int fs_gen; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 ; + int version; + int transaction_started = 0 ; + loff_t new_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; + + /* bad.... */ + lock_kernel() ; + th.t_trans_id = 0 ; + version = inode_items_version (inode); + + if (!file_capable (inode, block)) { + unlock_kernel() ; + return -EFBIG; + } + + /* if !create, we aren't changing the FS, so we don't need to + ** log anything, so we don't need to start a transaction + */ + if (!(create & GET_BLOCK_CREATE)) { + int ret ; + /* find number of block-th logical block of the file */ + ret = _get_block_create_0 (inode, block, bh_result, + create | GET_BLOCK_READ_DIRECT) ; + unlock_kernel() ; + return ret; + } + + if (block < 0) { + unlock_kernel(); + return -EIO; + } + + prevent_flush_page_lock(bh_result->b_page, inode) ; + inode->u.reiserfs_i.i_pack_on_close = 1 ; + + windex = push_journal_writer("reiserfs_get_block") ; + + /* set the key of the first byte in the 'block'-th block of file */ + make_cpu_key (&key, inode, + (loff_t)block * inode->i_sb->s_blocksize + 1, // k_offset + TYPE_ANY, 3/*key length*/); + if ((new_offset + inode->i_sb->s_blocksize) >= inode->i_size) { + journal_begin(&th, inode->i_sb, jbegin_count) ; + transaction_started = 1 ; + } + research: + + retval = search_for_position_by_key (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + + bh = get_bh (&path); + ih = get_ih (&path); + item = get_item (&path); + pos_in_item = path.pos_in_item; + + fs_gen = get_generation (inode->i_sb); + copy_item_head (&tmp_ih, ih); + + if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) { + /* we have to allocate block for the unformatted node */ + tag = find_tag (bh, ih, item, pos_in_item); + if (!transaction_started) { + pathrelse(&path) ; + journal_begin(&th, inode->i_sb, jbegin_count) ; + transaction_started = 1 ; + goto research ; + } + +#ifdef REISERFS_PREALLOCATE + repeat = reiserfs_new_unf_blocknrs2 (&th, inode, &allocated_block_nr, tag); +#else + repeat = reiserfs_new_unf_blocknrs (&th, &allocated_block_nr, tag); +#endif + + if (repeat == NO_DISK_SPACE) { + /* restart the transaction to give the journal a chance to free + ** some blocks. 
releases the path, so we have to go back to + ** research if we succeed on the second try + */ + restart_transaction(&th, inode, &path) ; +#ifdef REISERFS_PREALLOCATE + repeat = reiserfs_new_unf_blocknrs2 (&th, inode, &allocated_block_nr, tag); +#else + repeat = reiserfs_new_unf_blocknrs (&th, &allocated_block_nr, tag); +#endif + + if (repeat != NO_DISK_SPACE) { + goto research ; + } + retval = -ENOSPC; + goto failure; + } + + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + goto research; + } + } + + if (indirect_item_found (retval, ih)) { + /* 'block'-th block is in the file already (there is + corresponding cell in some indirect item). But it may be + zero unformatted node pointer (hole) */ + if (!item[pos_in_item]) { + /* use allocated block to plug the hole */ + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + goto research; + } + bh_result->b_state |= (1UL << BH_New); + item[pos_in_item] = cpu_to_le32 (allocated_block_nr); + journal_mark_dirty (&th, inode->i_sb, bh); + inode->i_blocks += (inode->i_sb->s_blocksize / 512) ; + reiserfs_update_sd(&th, inode) ; + } + set_block_dev_mapped(bh_result, le32_to_cpu (item[pos_in_item]), inode); + pathrelse (&path); +#ifdef REISERFS_CHECK + pop_journal_writer(windex) ; +#endif /* REISERFS_CHECK */ + if (transaction_started) + journal_end(&th, inode->i_sb, jbegin_count) ; + + allow_flush_page_lock(bh_result->b_page, inode) ; + unlock_kernel() ; + + /* the item was found, so new blocks were not added to the file + ** there is no need to make sure the inode is updated with this + ** transaction + */ + return 0; + } + + if (!transaction_started) { + /* if we don't pathrelse, we could vs-3050 on the buffer if + ** someone is waiting for it (they can't finish until the buffer + ** is released, we can start a new transaction until they finish) + */ + pathrelse(&path) ; + journal_begin(&th, inode->i_sb, jbegin_count) ; + transaction_started = 1 ; + goto research; + } + + /* desired position is not found or is in the direct item. We have + to append file with holes up to 'block'-th block converting + direct items to indirect one if necessary */ + done = 0; + do { + if (is_statdata_le_ih (ih)) { + __u32 unp = 0; + struct cpu_key tmp_key; + + /* indirect item has to be inserted */ + make_le_item_head (&tmp_ih, &key, version, 1, TYPE_INDIRECT, + UNFM_P_SIZE, 0/* free_space */); + + if (cpu_key_k_offset (&key) == 1) { + /* we are going to add 'block'-th block to the file. 
Use + allocated block for that */ + unp = cpu_to_le32 (allocated_block_nr); + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + bh_result->b_state |= (1UL << BH_New); + done = 1; + } + tmp_key = key; // ;) + set_cpu_key_k_offset (&tmp_key, 1); + PATH_LAST_POSITION(&path) ++; + + retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp); + if (retval) { + reiserfs_free_block (&th, allocated_block_nr); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc (&th, inode); +#endif + goto failure; // retval == -ENOSPC or -EIO or -EEXIST + } + if (unp) + inode->i_blocks += inode->i_sb->s_blocksize / 512; + //mark_tail_converted (inode); + } else if (is_direct_le_ih (ih)) { + /* direct item has to be converted */ + loff_t tail_offset; + + tail_offset = ((le_ih_k_offset (ih) - 1) & ~(inode->i_sb->s_blocksize - 1)) + 1; + if (tail_offset == cpu_key_k_offset (&key)) { + /* direct item we just found fits into block we have + to map. Convert it into unformatted node: use + bh_result for the conversion */ + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + unbh = bh_result; + done = 1; + } else { + /* we have to padd file tail stored in direct item(s) + up to block size and convert it to unformatted + node. FIXME: this should also get into page cache */ + + pathrelse(&path) ; + journal_end(&th, inode->i_sb, jbegin_count) ; + transaction_started = 0 ; + + retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; + if (retval) { + printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ; + if (allocated_block_nr) + reiserfs_free_block (&th, allocated_block_nr); + goto failure ; + } + goto research ; + } + retval = direct2indirect (&th, inode, &path, unbh, tail_offset); + /* it is important the mark_buffer_uptodate is done after + ** the direct2indirect. The buffer might contain valid + ** data newer than the data on disk (read by readpage, changed, + ** and then sent here by writepage). direct2indirect needs + ** to know if unbh was already up to date, so it can decide + ** if the data in unbh needs to be replaced with data from + ** the disk + */ + mark_buffer_uptodate (unbh, 1); + if (retval) { + reiserfs_free_block (&th, allocated_block_nr); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc (&th, inode); +#endif + goto failure; + } + /* we've converted the tail, so we must + ** flush unbh before the transaction commits + */ + reiserfs_add_page_to_flush_list(&th, inode, unbh) ; + + //inode->i_blocks += inode->i_sb->s_blocksize / 512; + //mark_tail_converted (inode); + } else { + /* append indirect item with holes if needed, when appending + pointer to 'block'-th block use block, which is already + allocated */ + struct cpu_key tmp_key; + struct unfm_nodeinfo un = {0, 0}; + +#ifdef CONFIG_REISERFS_CHECK + if (pos_in_item != le16_to_cpu (ih->ih_item_len) / UNFM_P_SIZE) + reiserfs_panic (inode->i_sb, "vs-: reiserfs_get_block: " + "invalid position for append"); +#endif + /* indirect item has to be appended, set up key of that position */ + make_cpu_key (&tmp_key, inode, + le_key_k_offset (version, &(ih->ih_key)) + op_bytes_number (ih, inode->i_sb->s_blocksize), + //pos_in_item * inode->i_sb->s_blocksize, + TYPE_INDIRECT, 3);// key type is unimportant + + if (cpu_key_k_offset (&tmp_key) == cpu_key_k_offset (&key)) { + /* we are going to add target block to the file. 
Use allocated + block for that */ + un.unfm_nodenum = cpu_to_le32 (allocated_block_nr); + set_block_dev_mapped (bh_result, allocated_block_nr, inode); + bh_result->b_state |= (1UL << BH_New); + done = 1; + } else { + /* paste hole to the indirect item */ + } + retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)&un, UNFM_P_SIZE); + if (retval) { + reiserfs_free_block (&th, allocated_block_nr); + +#ifdef REISERFS_PREALLOCATE + reiserfs_discard_prealloc (&th, inode); +#endif + goto failure; + } + if (un.unfm_nodenum) + inode->i_blocks += inode->i_sb->s_blocksize / 512; + //mark_tail_converted (inode); + } + + if (done == 1) + break; + + /* this loop could log more blocks than we had originally asked + ** for. So, we have to allow the transaction to end if it is + ** too big or too full. Update the inode so things are + ** consistent if we crash before the function returns + ** + ** release the path so that anybody waiting on the path before + ** ending their transaction will be able to continue. + */ + if (journal_transaction_should_end(&th, th.t_blocks_allocated)) { + restart_transaction(&th, inode, &path) ; + } + /* inserting indirect pointers for a hole can take a + ** long time. reschedule if needed + */ + if (current->need_resched) + schedule() ; + + retval = search_for_position_by_key (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + retval = -EIO; + goto failure; + } + if (retval == POSITION_FOUND) { + reiserfs_warning ("vs-: reiserfs_get_block: " + "%k should not be found", &key); + retval = -EEXIST; + pathrelse(&path) ; + goto failure; + } + bh = get_bh (&path); + ih = get_ih (&path); + item = get_item (&path); + pos_in_item = path.pos_in_item; + } while (1); + + + retval = 0; + reiserfs_check_path(&path) ; + + failure: + if (transaction_started) { + reiserfs_update_sd(&th, inode) ; + journal_end(&th, inode->i_sb, jbegin_count) ; + } + pop_journal_writer(windex) ; + allow_flush_page_lock(bh_result->b_page, inode) ; + unlock_kernel() ; + reiserfs_check_path(&path) ; + return retval; +} + + +// +// BAD: new directories have stat data of new type and all other items +// of old type. Version stored in the inode says about body items, so +// in update_stat_data we can not rely on inode, but have to check +// item version directly +// + +// called by read_inode +static void init_inode (struct inode * inode, struct path * path) +{ + struct buffer_head * bh; + struct item_head * ih; + __u32 rdev; + //int version = ITEM_VERSION_1; + + bh = PATH_PLAST_BUFFER (path); + ih = PATH_PITEM_HEAD (path); + + + copy_key (INODE_PKEY (inode), &(ih->ih_key)); + inode->i_generation = INODE_PKEY (inode)->k_dir_id; + inode->i_blksize = PAGE_SIZE; + + if (stat_data_v1 (ih)) { + struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); + unsigned long blocks; + + inode_items_version (inode) = ITEM_VERSION_1; + inode->i_mode = le16_to_cpu (sd->sd_mode); + inode->i_nlink = le16_to_cpu (sd->sd_nlink); + inode->i_uid = le16_to_cpu (sd->sd_uid); + inode->i_gid = le16_to_cpu (sd->sd_gid); + inode->i_size = le32_to_cpu (sd->sd_size); + inode->i_atime = le32_to_cpu (sd->sd_atime); + inode->i_mtime = le32_to_cpu (sd->sd_mtime); + inode->i_ctime = le32_to_cpu (sd->sd_ctime); + + inode->i_blocks = le32_to_cpu (sd->u.sd_blocks); + blocks = (inode->i_size + 511) >> 9; + blocks = _ROUND_UP (blocks, inode->i_blksize >> 9); + if (inode->i_blocks > blocks) { + // there was a bug in <=3.5.23 when i_blocks could take negative + // values. 
Starting from 3.5.17 this value could even be stored in + // stat data. For such files we set i_blocks based on file + // size. Just 2 notes: this can be wrong for sparce files. On-disk value will be + // only updated if file's inode will ever change + inode->i_blocks = blocks; + } + + rdev = le32_to_cpu (sd->u.sd_rdev); + inode->u.reiserfs_i.i_first_direct_byte = le32_to_cpu (sd->sd_first_direct_byte); + } else { + // new stat data found, but object may have old items + // (directories and symlinks) + struct stat_data * sd = (struct stat_data *)B_I_PITEM (bh, ih); + + /* both old and new directories have old keys */ + //version = (S_ISDIR (sd->sd_mode) ? ITEM_VERSION_1 : ITEM_VERSION_2); + if (S_ISDIR (sd->sd_mode) || S_ISLNK (sd->sd_mode)) + inode_items_version (inode) = ITEM_VERSION_1; + else + inode_items_version (inode) = ITEM_VERSION_2; + inode->i_mode = le16_to_cpu (sd->sd_mode); + inode->i_nlink = le32_to_cpu (sd->sd_nlink); + inode->i_uid = le32_to_cpu (sd->sd_uid); + inode->i_size = le64_to_cpu (sd->sd_size); + inode->i_gid = le32_to_cpu (sd->sd_gid); + inode->i_mtime = le32_to_cpu (sd->sd_mtime); + inode->i_atime = le32_to_cpu (sd->sd_atime); + inode->i_ctime = le32_to_cpu (sd->sd_ctime); + inode->i_blocks = le32_to_cpu (sd->sd_blocks); + rdev = le32_to_cpu (sd->u.sd_rdev); + } + + /* nopack = 0, by default */ + inode->u.reiserfs_i.nopack = 0; + + pathrelse (path); + if (S_ISREG (inode->i_mode)) { + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations ; + } else if (S_ISDIR (inode->i_mode)) { + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + } else if (S_ISLNK (inode->i_mode)) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + } else { + inode->i_blocks = 0; + init_special_inode(inode, inode->i_mode, rdev) ; + } +} + + +// update new stat data with inode fields +static void inode2sd (void * sd, struct inode * inode) +{ + struct stat_data * sd_v2 = (struct stat_data *)sd; + + sd_v2->sd_mode = cpu_to_le16 (inode->i_mode); + sd_v2->sd_nlink = cpu_to_le16 (inode->i_nlink); + sd_v2->sd_uid = cpu_to_le32 (inode->i_uid); + sd_v2->sd_size = cpu_to_le64 (inode->i_size); + sd_v2->sd_gid = cpu_to_le32 (inode->i_gid); + sd_v2->sd_mtime = cpu_to_le32 (inode->i_mtime); + sd_v2->sd_atime = cpu_to_le32 (inode->i_atime); + sd_v2->sd_ctime = cpu_to_le32 (inode->i_ctime); + sd_v2->sd_blocks = cpu_to_le32 (inode->i_blocks); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + sd_v2->u.sd_rdev = cpu_to_le32 (inode->i_rdev); +} + + +// used to copy inode's fields to old stat data +static void inode2sd_v1 (void * sd, struct inode * inode) +{ + struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; + + sd_v1->sd_mode = cpu_to_le16 (inode->i_mode); + sd_v1->sd_uid = cpu_to_le16 (inode->i_uid); + sd_v1->sd_gid = cpu_to_le16 (inode->i_gid); + sd_v1->sd_nlink = cpu_to_le16 (inode->i_nlink); + sd_v1->sd_size = cpu_to_le32 (inode->i_size); + sd_v1->sd_atime = cpu_to_le32 (inode->i_atime); + sd_v1->sd_ctime = cpu_to_le32 (inode->i_ctime); + sd_v1->sd_mtime = cpu_to_le32 (inode->i_mtime); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) + sd_v1->u.sd_rdev = cpu_to_le32 (inode->i_rdev); + else + sd_v1->u.sd_blocks = cpu_to_le32 (inode->i_blocks); + + // Sigh. 
i_first_direct_byte is back + sd_v1->sd_first_direct_byte = cpu_to_le32 (inode->u.reiserfs_i.i_first_direct_byte); +} + + +/* NOTE, you must prepare the buffer head before sending it here, +** and then log it after the call +*/ +static void update_stat_data (struct path * path, struct inode * inode) +{ + struct buffer_head * bh; + struct item_head * ih; + + bh = PATH_PLAST_BUFFER (path); + ih = PATH_PITEM_HEAD (path); + + if (!is_statdata_le_ih (ih)) + reiserfs_panic (inode->i_sb, "vs-13065: update_stat_data: key %k, found item %h", + INODE_PKEY (inode), ih); + + if (stat_data_v1 (ih)) { + // path points to old stat data + inode2sd_v1 (B_I_PITEM (bh, ih), inode); + } else { + inode2sd (B_I_PITEM (bh, ih), inode); + } + + return; +} + + +void reiserfs_update_sd (struct reiserfs_transaction_handle *th, + struct inode * inode) +{ + struct cpu_key key; + INITIALIZE_PATH(path); + struct buffer_head *bh ; + int fs_gen ; + struct item_head *ih, tmp_ih ; + int retval; + + make_cpu_key (&key, inode, SD_OFFSET, TYPE_STAT_DATA, 3);//key type is unimportant + + for(;;) { + int pos; + /* look for the object's stat data */ + retval = search_item (inode->i_sb, &key, &path); + if (retval == IO_ERROR) { + reiserfs_warning ("vs-13050: reiserfs_update_sd: " + "i/o failure occurred trying to update %K stat data", + &key); + return; + } + if (retval == ITEM_NOT_FOUND) { + pos = PATH_LAST_POSITION (&path); + pathrelse(&path) ; + if (inode->i_nlink == 0) { + /*printk ("vs-13050: reiserfs_update_sd: i_nlink == 0, stat data not found\n");*/ + return; + } + reiserfs_warning ("vs-13060: reiserfs_update_sd: " + "stat data of object %k (nlink == %d) not found (pos %d)\n", + INODE_PKEY (inode), inode->i_nlink, pos); + reiserfs_check_path(&path) ; + return; + } + + /* sigh, prepare_for_journal might schedule. When it schedules the + ** FS might change. We have to detect that, and loop back to the + ** search if the stat data item has moved + */ + bh = get_bh(&path) ; + ih = get_ih(&path) ; + copy_item_head (&tmp_ih, ih); + fs_gen = get_generation (inode->i_sb); + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved(&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + continue ; /* Stat_data item has been moved after scheduling. 
*/ + } + break; + } + update_stat_data (&path, inode); + journal_mark_dirty(th, th->t_super, bh) ; + pathrelse (&path); + return; +} + +void reiserfs_read_inode(struct inode *inode) { + make_bad_inode(inode) ; +} + + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// + +/* looks for stat data in the tree, and fills up the fields of in-core + inode stat data fields */ +void reiserfs_read_inode2 (struct inode * inode, void *p) +{ + INITIALIZE_PATH (path_to_sd); + struct cpu_key key; + struct reiserfs_iget4_args *args = (struct reiserfs_iget4_args *)p ; + unsigned long dirino; + int retval; + + if (!p) { + make_bad_inode(inode) ; + return; + } + + dirino = args->objectid ; + + /* set version 1, version 2 could be used too, because stat data + key is the same in both versions */ + key.version = ITEM_VERSION_1; + key.on_disk_key.k_dir_id = dirino; + key.on_disk_key.k_objectid = inode->i_ino; + key.on_disk_key.u.k_offset_v1.k_offset = SD_OFFSET; + key.on_disk_key.u.k_offset_v1.k_uniqueness = SD_UNIQUENESS; + + /* look for the object's stat data */ + retval = search_item (inode->i_sb, &key, &path_to_sd); + if (retval == IO_ERROR) { + reiserfs_warning ("vs-13070: reiserfs_read_inode2: " + "i/o failure occurred trying to find stat data of %K\n", + &key); + make_bad_inode(inode) ; + return; + } + if (retval != ITEM_FOUND) { + reiserfs_warning ("vs-13042: reiserfs_read_inode2: %K not found\n", &key); + pathrelse (&path_to_sd); + make_bad_inode(inode) ; + return; + } + + init_inode (inode, &path_to_sd); + reiserfs_check_path(&path_to_sd) ; /* init inode should be relsing */ + +} + + +struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key) +{ + struct inode * inode; + struct reiserfs_iget4_args args ; + + args.objectid = key->on_disk_key.k_dir_id ; + inode = iget4 (s, key->on_disk_key.k_objectid, 0, (void *)(&args)); + if (!inode) + return inode ; + + // if (comp_short_keys (INODE_PKEY (inode), key)) { + if (is_bad_inode (inode)) { + reiserfs_warning ("vs-13048: reiserfs_iget: " + "bad_inode. Stat data of (%lu %lu) not found\n", + key->on_disk_key.k_dir_id, key->on_disk_key.k_objectid); + iput (inode); + inode = 0; + } + return inode; +} + + +// +// initially this function was derived from minix or ext2's analog and +// evolved as the prototype did +// +/* looks for stat data, then copies fields to it, marks the buffer + containing stat data as dirty */ +/* reiserfs inodes are never really dirty, since the dirty inode call +** always logs them. This call allows the VFS inode marking routines +** to properly mark inodes for datasync and such, but only actually +** does something when called for a synchronous update. 
+*/ +void reiserfs_write_inode (struct inode * inode, int do_sync) { + struct reiserfs_transaction_handle th ; + int jbegin_count = 1 ; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning("clm-6005: writing inode %lu on readonly FS\n", + inode->i_ino) ; + return ; + } + if (do_sync) { + lock_kernel() ; + journal_begin(&th, inode->i_sb, jbegin_count) ; + reiserfs_update_sd (&th, inode); + journal_end_sync(&th, inode->i_sb, jbegin_count) ; + unlock_kernel() ; + } +} + +void reiserfs_dirty_inode (struct inode * inode) { + struct reiserfs_transaction_handle th ; + + if (inode->i_sb->s_flags & MS_RDONLY) { + reiserfs_warning("clm-6006: writing inode %lu on readonly FS\n", + inode->i_ino) ; + return ; + } + lock_kernel() ; + journal_begin(&th, inode->i_sb, 1) ; + reiserfs_update_sd (&th, inode); + journal_end(&th, inode->i_sb, 1) ; + unlock_kernel() ; +} + + +/* FIXME: no need any more. right? */ +int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode) +{ + int err = 0; + + reiserfs_update_sd (th, inode); + return err; +} + + +/* stat data of new object is inserted already, this inserts the item + containing "." and ".." entries */ +static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, + struct item_head * ih, struct path * path, const struct inode * dir) +{ + struct super_block * sb = th->t_super; + char empty_dir [EMPTY_DIR_SIZE]; + char * body = empty_dir; + struct cpu_key key; + int retval; + + _make_cpu_key (&key, ITEM_VERSION_1, le32_to_cpu (ih->ih_key.k_dir_id), + le32_to_cpu (ih->ih_key.k_objectid), DOT_OFFSET, TYPE_DIRENTRY, 3/*key length*/); + + /* compose item head for new item. Directories consist of items of + old type (ITEM_VERSION_1). Do not set key (second arg is 0), it + is done by reiserfs_new_inode */ + if (old_format_only (sb)) { + make_le_item_head (ih, 0, ITEM_VERSION_1, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE_V1, 2); + + make_empty_dir_item_v1 (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, + le32_to_cpu (INODE_PKEY (dir)->k_dir_id), + le32_to_cpu (INODE_PKEY (dir)->k_objectid)); + } else { + make_le_item_head (ih, 0, ITEM_VERSION_1, DOT_OFFSET, TYPE_DIRENTRY, EMPTY_DIR_SIZE, 2); + + make_empty_dir_item (body, ih->ih_key.k_dir_id, ih->ih_key.k_objectid, + le32_to_cpu (INODE_PKEY (dir)->k_dir_id), + le32_to_cpu (INODE_PKEY (dir)->k_objectid)); + } + + /* look for place in the tree for new item */ + retval = search_item (sb, &key, path); + if (retval == IO_ERROR) { + reiserfs_warning ("vs-13080: reiserfs_new_directory: " + "i/o failure occured creating new directory\n"); + return -EIO; + } + if (retval == ITEM_FOUND) { + pathrelse (path); + reiserfs_warning ("vs-13070: reiserfs_new_directory: " + "object with this key exists (%k)", &(ih->ih_key)); + return -EEXIST; + } + + /* insert item, that is empty directory item */ + return reiserfs_insert_item (th, path, &key, ih, body); +} + + +/* stat data of object has been inserted, this inserts the item + containing the body of symlink */ +static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, + struct item_head * ih, + struct path * path, const char * symname, int item_len) +{ + struct super_block * sb = th->t_super; + struct cpu_key key; + int retval; + + _make_cpu_key (&key, ITEM_VERSION_1, + le32_to_cpu (ih->ih_key.k_dir_id), + le32_to_cpu (ih->ih_key.k_objectid), + 1, TYPE_DIRECT, 3/*key length*/); + + make_le_item_head (ih, 0, ITEM_VERSION_1, 1, TYPE_DIRECT, item_len, 0/*free_space*/); + + /* look for place in the tree for new item */ + 
retval = search_item (sb, &key, path);
+    if (retval == IO_ERROR) {
+        reiserfs_warning ("vs-13080: reiserfs_new_symlink: "
+                          "i/o failure occurred creating new symlink\n");
+        return -EIO;
+    }
+    if (retval == ITEM_FOUND) {
+        pathrelse (path);
+        reiserfs_warning ("vs-13080: reiserfs_new_symlink: "
+                          "object with this key exists (%k)", &(ih->ih_key));
+        return -EEXIST;
+    }
+
+    /* insert item, that is the body of the symlink */
+    return reiserfs_insert_item (th, path, &key, ih, symname);
+}
+
+
+/* inserts the stat data into the tree, and then calls
+   reiserfs_new_directory (to insert ".", ".." item if new object is
+   directory) or reiserfs_new_symlink (to insert symlink body if new
+   object is symlink) or nothing (if new object is regular file) */
+struct inode * reiserfs_new_inode (struct reiserfs_transaction_handle *th,
+                                   const struct inode * dir, int mode,
+                                   const char * symname,
+                                   int i_size, /* 0 for regular, EMPTY_DIR_SIZE for dirs,
+                                                  strlen (symname) for symlinks */
+                                   struct dentry *dentry, struct inode *inode, int * err)
+{
+    struct super_block * sb;
+    INITIALIZE_PATH (path_to_key);
+    struct cpu_key key;
+    struct item_head ih;
+    struct stat_data sd;
+    int retval;
+
+    if (!dir || !dir->i_nlink) {
+        *err = -EPERM;
+        iput(inode) ;
+        return NULL;
+    }
+
+    sb = dir->i_sb;
+    inode->i_sb = sb;
+    inode->i_flags = 0;//inode->i_sb->s_flags;
+
+    /* item head of new item */
+    ih.ih_key.k_dir_id = INODE_PKEY (dir)->k_objectid;
+    ih.ih_key.k_objectid = cpu_to_le32 (reiserfs_get_unused_objectid (th));
+    if (!ih.ih_key.k_objectid) {
+        iput(inode) ;
+        *err = -ENOMEM;
+        return NULL;
+    }
+    if (old_format_only (sb))
+        make_le_item_head (&ih, 0, ITEM_VERSION_1, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+    else
+        make_le_item_head (&ih, 0, ITEM_VERSION_2, SD_OFFSET, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+
+
+    /* key to search for correct place for new stat data */
+    _make_cpu_key (&key, ITEM_VERSION_2, le32_to_cpu (ih.ih_key.k_dir_id),
+                   le32_to_cpu (ih.ih_key.k_objectid), SD_OFFSET, TYPE_STAT_DATA, 3/*key length*/);
+
+    /* find the proper place for inserting the stat data */
+    retval = search_item (sb, &key, &path_to_key);
+    if (retval == IO_ERROR) {
+        iput (inode);
+        *err = -EIO;
+        return NULL;
+    }
+    if (retval == ITEM_FOUND) {
+        pathrelse (&path_to_key);
+        iput (inode);
+        *err = -EEXIST;
+        return NULL;
+    }
+
+    /* fill stat data */
+    inode->i_mode = mode;
+    inode->i_nlink = (S_ISDIR (mode) ? 2 : 1);
+    inode->i_uid = current->fsuid;
+    if (dir->i_mode & S_ISGID) {
+        inode->i_gid = dir->i_gid;
+        if (S_ISDIR(mode))
+            inode->i_mode |= S_ISGID;
+    } else
+        inode->i_gid = current->fsgid;
+
+    inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+    inode->i_size = i_size;
+    inode->i_blocks = (inode->i_size + 511) >> 9;
+    inode->u.reiserfs_i.i_first_direct_byte = S_ISLNK(mode) ?
1 : + U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/; + + if (old_format_only (sb)) + inode2sd_v1 (&sd, inode); + else + inode2sd (&sd, inode); + + // these do not go to on-disk stat data + inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); + inode->i_blksize = PAGE_SIZE; + inode->i_dev = sb->s_dev; + + // store in in-core inode the key of stat data and version all + // object items will have (directory items will have old offset + // format, other new objects will consist of new items) + memcpy (INODE_PKEY (inode), &(ih.ih_key), KEY_SIZE); + if (old_format_only (sb) || S_ISDIR(mode) || S_ISLNK(mode)) + inode_items_version (inode) = ITEM_VERSION_1; + else + inode_items_version (inode) = ITEM_VERSION_2; + + /* insert the stat data into the tree */ + retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd)); + if (retval) { + iput (inode); + *err = retval; + reiserfs_check_path(&path_to_key) ; + return NULL; + } + + if (S_ISDIR(mode)) { + /* insert item with "." and ".." */ + retval = reiserfs_new_directory (th, &ih, &path_to_key, dir); + } + + if (S_ISLNK(mode)) { + /* insert body of symlink */ + if (!old_format_only (sb)) + i_size = ROUND_UP(i_size); + retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size); + } + if (retval) { + inode->i_nlink = 0; + iput (inode); + *err = retval; + reiserfs_check_path(&path_to_key) ; + return NULL; + } + + /* not a perfect generation count, as object ids can be reused, but this + ** is as good as reiserfs can do right now + */ + inode->i_generation = INODE_PKEY (inode)->k_dir_id; + insert_inode_hash (inode); + // we do not mark inode dirty: on disk content matches to the + // in-core one + reiserfs_check_path(&path_to_key) ; + + return inode; +} + +/* +** finds the tail page in the page cache, +** reads the last block in. +** +** On success, page_result is set to a locked, pinned page, and bh_result +** is set to an up to date buffer for the last block in the file. returns 0. +** +** tail conversion is not done, so bh_result might not be valid for writing +** check buffer_mapped(bh_result) and bh_result->b_blocknr != 0 before +** trying to write the block. +** +** on failure, nonzero is returned, page_result and bh_result are untouched. +*/ +static int grab_tail_page(struct inode *p_s_inode, + struct page **page_result, + struct buffer_head **bh_result) { + + /* we want the page with the last byte in the file, + ** not the page that will hold the next byte for appending + */ + unsigned long index = (p_s_inode->i_size-1) >> PAGE_CACHE_SHIFT ; + unsigned long pos = 0 ; + unsigned long start = 0 ; + unsigned long blocksize = p_s_inode->i_sb->s_blocksize ; + unsigned long offset = (p_s_inode->i_size) & (PAGE_CACHE_SIZE - 1) ; + struct buffer_head *bh ; + struct buffer_head *head ; + struct page * page ; + int error ; + + /* we know that we are only called with inode->i_size > 0. + ** we also know that a file tail can never be as big as a block + ** If i_size % blocksize == 0, our file is currently block aligned + ** and it won't need converting or zeroing after a truncate. 
+ */ + if ((offset & (blocksize - 1)) == 0) { + return -ENOENT ; + } + page = grab_cache_page(p_s_inode->i_mapping, index) ; + error = PTR_ERR(page) ; + if (IS_ERR(page)) { + goto out ; + } + /* start within the page of the last block in the file */ + start = (offset / blocksize) * blocksize ; + + error = block_prepare_write(page, start, offset, + reiserfs_get_block_create_0) ; + if (error) + goto unlock ; + + kunmap(page) ; /* mapped by block_prepare_write */ + + head = page->buffers ; + bh = head; + do { + if (pos >= start) { + break ; + } + bh = bh->b_this_page ; + pos += blocksize ; + } while(bh != head) ; + + if (!buffer_uptodate(bh)) { + /* note, this should never happen, prepare_write should + ** be taking care of this for us. If the buffer isn't up to date, + ** I've screwed up the code to find the buffer, or the code to + ** call prepare_write + */ + reiserfs_warning("clm-6000: error reading block %lu on dev %s\n", + bh->b_blocknr, kdevname(bh->b_dev)) ; + error = -EIO ; + goto unlock ; + } + *bh_result = bh ; + *page_result = page ; + +out: + return error ; + +unlock: + UnlockPage(page) ; + page_cache_release(page) ; + return error ; +} + +/* +** vfs version of truncate file. Must NOT be called with +** a transaction already started. +** +** some code taken from block_truncate_page +*/ +void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { + struct reiserfs_transaction_handle th ; + int windex ; + + /* we want the offset for the first byte after the end of the file */ + unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; + unsigned blocksize = p_s_inode->i_sb->s_blocksize ; + unsigned length ; + struct page *page = NULL ; + int error ; + struct buffer_head *bh = NULL ; + + if (p_s_inode->i_size > 0) { + if ((error = grab_tail_page(p_s_inode, &page, &bh))) { + // -ENOENT means we truncated past the end of the file, + // and get_block_create_0 could not find a block to read in, + // which is ok. + if (error != -ENOENT) + reiserfs_warning("clm-6001: grab_tail_page failed %d\n", error); + page = NULL ; + bh = NULL ; + } + } + + /* so, if page != NULL, we have a buffer head for the offset at + ** the end of the file. if the bh is mapped, and bh->b_blocknr != 0, + ** then we have an unformatted node. Otherwise, we have a direct item, + ** and no zeroing is required. We zero after the truncate, because the + ** truncate might pack the item anyway (it will unmap bh if it packs). 
+ */ + prevent_flush_page_lock(page, p_s_inode) ; + journal_begin(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 ) ; + windex = push_journal_writer("reiserfs_vfs_truncate_file") ; + reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; + pop_journal_writer(windex) ; + journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 ) ; + allow_flush_page_lock(page, p_s_inode) ; + + if (page && buffer_mapped(bh) && bh->b_blocknr != 0) { + length = offset & (blocksize - 1) ; + /* if we are not on a block boundary */ + if (length) { + length = blocksize - length ; + memset((char *)kmap(page) + offset, 0, length) ; + flush_dcache_page(page) ; + kunmap(page) ; + mark_buffer_dirty(bh) ; + } + } + + if (page) { + UnlockPage(page) ; + page_cache_release(page) ; + } + return ; +} + +static int map_block_for_writepage(struct inode *inode, + struct buffer_head *bh_result, + unsigned long block) { + struct reiserfs_transaction_handle th ; + int fs_gen ; + struct item_head tmp_ih ; + struct item_head *ih ; + struct buffer_head *bh ; + __u32 *item ; + struct cpu_key key ; + INITIALIZE_PATH(path) ; + int pos_in_item ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT ; + loff_t byte_offset = (block << inode->i_sb->s_blocksize_bits) + 1 ; + int retval ; + int use_get_block = 0 ; + int bytes_copied = 0 ; + int copy_size ; + +start_over: + lock_kernel() ; + prevent_flush_page_lock(bh_result->b_page, inode) ; + journal_begin(&th, inode->i_sb, jbegin_count) ; + + make_cpu_key(&key, inode, byte_offset, TYPE_ANY, 3) ; + +research: + retval = search_for_position_by_key(inode->i_sb, &key, &path) ; + if (retval != POSITION_FOUND) { + use_get_block = 1; + goto out ; + } + + bh = get_bh(&path) ; + ih = get_ih(&path) ; + item = get_item(&path) ; + pos_in_item = path.pos_in_item ; + + /* we've found an unformatted node */ + if (indirect_item_found(retval, ih)) { + if (bytes_copied > 0) { + reiserfs_warning("clm-6002: bytes_copied %d\n", bytes_copied) ; + } + if (!item[pos_in_item]) { + /* crap, we are writing to a hole */ + use_get_block = 1; + goto out ; + } + set_block_dev_mapped(bh_result, le32_to_cpu(item[pos_in_item]), inode); + } else if (is_direct_le_ih(ih)) { + char *p ; + p = page_address(bh_result->b_page) ; + p += (byte_offset -1) & (PAGE_CACHE_SIZE - 1) ; + copy_size = le16_to_cpu(ih->ih_item_len) - pos_in_item ; + + fs_gen = get_generation(inode->i_sb) ; + copy_item_head(&tmp_ih, ih) ; + reiserfs_prepare_for_journal(inode->i_sb, bh, 1) ; + if (fs_changed (fs_gen, inode->i_sb) && item_moved (&tmp_ih, &path)) { + reiserfs_restore_prepared_buffer(inode->i_sb, bh) ; + goto research; + } + + memcpy( B_I_PITEM(bh, ih) + pos_in_item, p + bytes_copied, copy_size) ; + + journal_mark_dirty(&th, inode->i_sb, bh) ; + bytes_copied += copy_size ; + set_block_dev_mapped(bh_result, 0, inode); + + /* are there still bytes left? */ + if (bytes_copied < bh_result->b_size && + (byte_offset + bytes_copied) < inode->i_size) { + set_cpu_key_k_offset(&key, cpu_key_k_offset(&key) + copy_size) ; + goto research ; + } + } else { + reiserfs_warning("clm-6003: bad item inode %lu, device %s\n", inode->i_ino, kdevname(inode->i_sb->s_dev)) ; + retval = -EIO ; + goto out ; + } + retval = 0 ; + +out: + pathrelse(&path) ; + journal_end(&th, inode->i_sb, jbegin_count) ; + allow_flush_page_lock(bh_result->b_page, inode) ; + unlock_kernel() ; + + /* this is where we fill in holes in the file. 
*/ + if (use_get_block) { + kmap(bh_result->b_page) ; + retval = reiserfs_get_block(inode, block, bh_result, 1) ; + kunmap(bh_result->b_page) ; + if (!retval) { + if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { + /* get_block failed to find a mapped unformatted node. */ + use_get_block = 0 ; + goto start_over ; + } + } + } + return retval ; +} + +/* helper func to get a buffer head ready for writepage to send to +** ll_rw_block +*/ +static inline void submit_bh_for_writepage(struct buffer_head **bhp, int nr) { + struct buffer_head *bh ; + int i; + for(i = 0 ; i < nr ; i++) { + bh = bhp[i] ; + lock_buffer(bh) ; + atomic_inc(&bh->b_count) ; /* async end_io handler decs this */ + set_buffer_async_io(bh) ; + /* submit_bh doesn't care if the buffer is dirty, but nobody + ** later on in the call chain will be cleaning it. So, we + ** clean the buffer here, it still gets written either way. + */ + clear_bit(BH_Dirty, &bh->b_state) ; + set_bit(BH_Uptodate, &bh->b_state) ; + submit_bh(WRITE, bh) ; + } +} + +static int reiserfs_write_full_page(struct page *page) { + struct inode *inode = page->mapping->host ; + unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; + unsigned last_offset = PAGE_CACHE_SIZE; + int error = 0; + unsigned long block ; + unsigned cur_offset = 0 ; + struct buffer_head *head, *bh ; + int partial = 0 ; + struct buffer_head *arr[PAGE_CACHE_SIZE/512] ; + int nr = 0 ; + + if (!page->buffers) { + block_prepare_write(page, 0, 0, NULL) ; + kunmap(page) ; + } + /* last page in the file, zero out any contents past the + ** last byte in the file + */ + if (page->index >= end_index) { + last_offset = inode->i_size & (PAGE_CACHE_SIZE - 1) ; + /* no file contents in this page */ + if (page->index >= end_index + 1 || !last_offset) { + error = -EIO ; + goto fail ; + } + memset((char *)kmap(page)+last_offset, 0, PAGE_CACHE_SIZE-last_offset) ; + flush_dcache_page(page) ; + kunmap(page) ; + } + head = page->buffers ; + bh = head ; + block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ; + do { + /* if this offset in the page is outside the file */ + if (cur_offset >= last_offset) { + if (!buffer_uptodate(bh)) + partial = 1 ; + } else { + /* fast path, buffer mapped to an unformatted node */ + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + arr[nr++] = bh ; + } else { + /* buffer not mapped yet, or points to a direct item. + ** search and dirty or log + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail ; + } + /* map_block_for_writepage either found an unformatted node + ** and mapped it for us, or it found a direct item + ** and logged the changes. + */ + if (buffer_mapped(bh) && bh->b_blocknr != 0) { + arr[nr++] = bh ; + } + } + } + bh = bh->b_this_page ; + cur_offset += bh->b_size ; + block++ ; + } while(bh != head) ; + + /* if this page only had a direct item, it is very possible for + ** nr == 0 without there being any kind of error. 
+ */ + if (nr) { + submit_bh_for_writepage(arr, nr) ; + } else { + UnlockPage(page) ; + } + if (!partial) + SetPageUptodate(page) ; + + return 0 ; + +fail: + if (nr) { + submit_bh_for_writepage(arr, nr) ; + } else { + UnlockPage(page) ; + } + ClearPageUptodate(page) ; + return error ; +} + +// +// this is exactly what 2.3.99-pre9's ext2_readpage is +// +static int reiserfs_readpage (struct file *f, struct page * page) +{ + return block_read_full_page (page, reiserfs_get_block); +} + + +// +// modified from ext2_writepage is +// +static int reiserfs_writepage (struct page * page) +{ + struct inode *inode = page->mapping->host ; + reiserfs_wait_on_write_block(inode->i_sb) ; + return reiserfs_write_full_page(page) ; +} + + +// +// from ext2_prepare_write, but modified +// +int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { + struct inode *inode = page->mapping->host ; + reiserfs_wait_on_write_block(inode->i_sb) ; + fix_tail_page_for_writing(page) ; + return block_prepare_write(page, from, to, reiserfs_get_block) ; +} + + +// +// this is exactly what 2.3.99-pre9's ext2_bmap is +// +static int reiserfs_aop_bmap(struct address_space *as, long block) { + return generic_block_bmap(as, block, reiserfs_bmap) ; +} + + +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to) { + struct inode *inode = page->mapping->host ; + int ret ; + struct reiserfs_transaction_handle th ; + + reiserfs_wait_on_write_block(inode->i_sb) ; + prevent_flush_page_lock(page, inode) ; + ret = generic_commit_write(f, page, from, to) ; + /* we test for O_SYNC here so we can commit the transaction + ** for any packed tails the file might have had + */ + if (f->f_flags & O_SYNC) { + journal_begin(&th, inode->i_sb, 1) ; + reiserfs_prepare_for_journal(inode->i_sb, + SB_BUFFER_WITH_SB(inode->i_sb), 1) ; + journal_mark_dirty(&th, inode->i_sb, SB_BUFFER_WITH_SB(inode->i_sb)) ; + journal_end_sync(&th, inode->i_sb, 1) ; + } + allow_flush_page_lock(page, inode) ; + return ret ; +} + +struct address_space_operations reiserfs_address_space_operations = { + writepage: reiserfs_writepage, + readpage: reiserfs_readpage, + sync_page: block_sync_page, + prepare_write: reiserfs_prepare_write, + commit_write: reiserfs_commit_write, + bmap: reiserfs_aop_bmap +} ; diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/ioctl.c linux/fs/reiserfs/ioctl.c --- v2.4.0/linux/fs/reiserfs/ioctl.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/ioctl.c Mon Jan 15 12:42:32 2001 @@ -0,0 +1,101 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +/* +** reiserfs_ioctl - handler for ioctl for inode +** supported commands: +** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect +** and prevent packing file (argument arg has to be non-zero) +** 2) That's all for a while ... +*/ +int reiserfs_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case REISERFS_IOC_UNPACK: + if (arg) + return reiserfs_unpack (inode, filp); + + default: + return -ENOTTY; + } +} + +/* +** reiserfs_unpack +** Function try to convert tail from direct item into indirect. 
+** It set up nopack attribute in the inode.u.reiserfs_i.nopack +*/ +int reiserfs_unpack (struct inode * inode, struct file * filp) +{ + int retval = 0; + int index ; + struct page *page ; + unsigned long write_from ; + unsigned long blocksize = inode->i_sb->s_blocksize ; + + if (inode->i_size == 0) { + return -EINVAL ; + } + /* ioctl already done */ + if (inode->u.reiserfs_i.nopack) { + return 0 ; + } + lock_kernel(); + + /* we need to make sure nobody is changing the file size beneath + ** us + */ + down(&inode->i_sem) ; + + write_from = inode->i_size & (blocksize - 1) ; + /* if we are on a block boundary, we are already unpacked. */ + if ( write_from == 0) { + inode->u.reiserfs_i.nopack = 1; + goto out ; + } + + /* we unpack by finding the page with the tail, and calling + ** reiserfs_prepare_write on that page. This will force a + ** reiserfs_get_block to unpack the tail for us. + */ + index = inode->i_size >> PAGE_CACHE_SHIFT ; + page = grab_cache_page(inode->i_mapping, index) ; + retval = PTR_ERR(page) ; + if (IS_ERR(page)) { + goto out ; + } + retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ; + if (retval) + goto out_unlock ; + + /* conversion can change page contents, must flush */ + flush_dcache_page(page) ; + inode->u.reiserfs_i.nopack = 1; + kunmap(page) ; /* mapped by prepare_write */ + +out_unlock: + UnlockPage(page) ; + page_cache_release(page) ; + +out: + up(&inode->i_sem) ; + unlock_kernel(); + return retval; +} diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/item_ops.c linux/fs/reiserfs/item_ops.c --- v2.4.0/linux/fs/reiserfs/item_ops.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/item_ops.c Mon Jan 15 12:42:32 2001 @@ -0,0 +1,718 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +// this contains item handlers for old item types: sd, direct, +// indirect, directory + +/* and where are the comments? how about saying where we can find an + explanation of each item handler method? -Hans */ + +////////////////////////////////////////////////////////////////////////////// +// stat data functions +// +static int sd_bytes_number (struct item_head * ih, int block_size) +{ + return 0; +} + +static void sd_decrement_key (struct cpu_key * key) +{ + key->on_disk_key.k_objectid --; + set_cpu_key_k_type (key, TYPE_ANY); + set_cpu_key_k_offset(key, (loff_t)(-1)); +} + +static int sd_is_left_mergeable (struct key * key, unsigned long bsize) +{ + return 0; +} + + + +static char * print_time (time_t t) +{ + static char timebuf[256]; + +#ifndef __KERNEL__ +// struct tm *loctime; +// loctime = localtime (&t); + sprintf (timebuf, "%s", asctime (localtime (&t))); + timebuf[strlen (timebuf) - 1] = 0; +#else + sprintf (timebuf, "%ld", t); +#endif + return timebuf; +} + + +static void sd_print_item (struct item_head * ih, char * item) +{ + printk ("\tmode | size | nlinks | first direct | mtime\n"); + if (stat_data_v1 (ih)) { + struct stat_data_v1 * sd = (struct stat_data_v1 *)item; + + printk ("\t0%-6o | %6u | %2u | %d | %s\n", sd->sd_mode, sd->sd_size, + sd->sd_nlink, sd->sd_first_direct_byte, print_time (sd->sd_mtime)); + } else { + struct stat_data * sd = (struct stat_data *)item; + + printk ("\t0%-6o | %6Lu | %2u | %d | %s\n", sd->sd_mode, (unsigned long long)(sd->sd_size), + sd->sd_nlink, sd->u.sd_rdev, print_time (sd->sd_mtime)); + } +} + +static void sd_check_item (struct item_head * ih, char * item) +{ + // FIXME: type something here! 
+} + + +static int sd_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + vi->vi_index = TYPE_STAT_DATA; + //vi->vi_type |= VI_TYPE_STAT_DATA;// not needed? + return 0; +} + + +static int sd_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + if (start_skip || end_skip) + BUG (); + return -1; +} + + +static int sd_check_right (struct virtual_item * vi, int free) +{ + return -1; +} + +static int sd_part_size (struct virtual_item * vi, int first, int count) +{ + if (count) + BUG (); + return 0; +} + +static int sd_unit_num (struct virtual_item * vi) +{ + return vi->vi_item_len - IH_SIZE; +} + + +static void sd_print_vi (struct virtual_item * vi) +{ + reiserfs_warning ("STATDATA, index %d, type 0x%x, %h\n", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +struct item_operations stat_data_ops = { + sd_bytes_number, + sd_decrement_key, + sd_is_left_mergeable, + sd_print_item, + sd_check_item, + + sd_create_vi, + sd_check_left, + sd_check_right, + sd_part_size, + sd_unit_num, + sd_print_vi +}; + + + +////////////////////////////////////////////////////////////////////////////// +// direct item functions +// +static int direct_bytes_number (struct item_head * ih, int block_size) +{ + return le16_to_cpu (ih->ih_item_len); +} + + +// FIXME: this should probably switch to indirect as well +static void direct_decrement_key (struct cpu_key * key) +{ + cpu_key_k_offset_dec (key); + if (cpu_key_k_offset (key) == 0) + set_cpu_key_k_type (key, TYPE_STAT_DATA); +} + + +static int direct_is_left_mergeable (struct key * key, unsigned long bsize) +{ + int version = le_key_version (key); + return ((le_key_k_offset (version, key) & (bsize - 1)) != 1); +} + + +static void direct_print_item (struct item_head * ih, char * item) +{ + int j = 0; + +// return; + printk ("\""); + while (j < ih->ih_item_len) + printk ("%c", item[j++]); + printk ("\"\n"); +} + + +static void direct_check_item (struct item_head * ih, char * item) +{ + // FIXME: type something here! 
+} + + +static int direct_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + vi->vi_index = TYPE_DIRECT; + //vi->vi_type |= VI_TYPE_DIRECT; + return 0; +} + +static int direct_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % 8; + return bytes ?: -1; +} + + +static int direct_check_right (struct virtual_item * vi, int free) +{ + return direct_check_left (vi, free, 0, 0); +} + +static int direct_part_size (struct virtual_item * vi, int first, int count) +{ + return count; +} + + +static int direct_unit_num (struct virtual_item * vi) +{ + return vi->vi_item_len - IH_SIZE; +} + + +static void direct_print_vi (struct virtual_item * vi) +{ + reiserfs_warning ("DIRECT, index %d, type 0x%x, %h\n", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +struct item_operations direct_ops = { + direct_bytes_number, + direct_decrement_key, + direct_is_left_mergeable, + direct_print_item, + direct_check_item, + + direct_create_vi, + direct_check_left, + direct_check_right, + direct_part_size, + direct_unit_num, + direct_print_vi +}; + + + +////////////////////////////////////////////////////////////////////////////// +// indirect item functions +// + +static int indirect_bytes_number (struct item_head * ih, int block_size) +{ + return le16_to_cpu (ih->ih_item_len) / UNFM_P_SIZE * block_size; //- get_ih_free_space (ih); +} + + +// decrease offset, if it becomes 0, change type to stat data +static void indirect_decrement_key (struct cpu_key * key) +{ + cpu_key_k_offset_dec (key); + if (cpu_key_k_offset (key) == 0) + set_cpu_key_k_type (key, TYPE_STAT_DATA); +} + + +// if it is not first item of the body, then it is mergeable +static int indirect_is_left_mergeable (struct key * key, unsigned long bsize) +{ + int version = le_key_version (key); + return (le_key_k_offset (version, key) != 1); +} + + +// printing of indirect item +static void start_new_sequence (__u32 * start, int * len, __u32 new) +{ + *start = new; + *len = 1; +} + + +static int sequence_finished (__u32 start, int * len, __u32 new) +{ + if (start == INT_MAX) + return 1; + + if (start == 0 && new == 0) { + (*len) ++; + return 0; + } + if (start != 0 && (start + *len) == new) { + (*len) ++; + return 0; + } + return 1; +} + +static void print_sequence (__u32 start, int len) +{ + if (start == INT_MAX) + return; + + if (len == 1) + printk (" %d", start); + else + printk (" %d(%d)", start, len); +} + + +static void indirect_print_item (struct item_head * ih, char * item) +{ + int j; + __u32 * unp, prev = INT_MAX; + int num; + + unp = (__u32 *)item; + + if (ih->ih_item_len % UNFM_P_SIZE) + printk ("indirect_print_item: invalid item len"); + + printk ("%d pointers\n[ ", (int)I_UNFM_NUM (ih)); + for (j = 0; j < I_UNFM_NUM (ih); j ++) { + if (sequence_finished (prev, &num, unp[j])) { + print_sequence (prev, num); + start_new_sequence (&prev, &num, unp[j]); + } + } + print_sequence (prev, num); + printk ("]\n"); +} + +static void indirect_check_item (struct item_head * ih, char * item) +{ + // FIXME: type something here! 
+} + + +static int indirect_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + vi->vi_index = TYPE_INDIRECT; + //vi->vi_type |= VI_TYPE_INDIRECT; + return 0; +} + +static int indirect_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + int bytes; + + bytes = free - free % UNFM_P_SIZE; + return bytes ?: -1; +} + + +static int indirect_check_right (struct virtual_item * vi, int free) +{ + return indirect_check_left (vi, free, 0, 0); +} + + + +// return size in bytes of 'units' units. If first == 0 - calculate from the head (left), otherwise - from tail (right) +static int indirect_part_size (struct virtual_item * vi, int first, int units) +{ + // unit of indirect item is byte (yet) + return units; +} + +static int indirect_unit_num (struct virtual_item * vi) +{ + // unit of indirect item is byte (yet) + return vi->vi_item_len - IH_SIZE; +} + +static void indirect_print_vi (struct virtual_item * vi) +{ + reiserfs_warning ("INDIRECT, index %d, type 0x%x, %h\n", + vi->vi_index, vi->vi_type, vi->vi_ih); +} + +struct item_operations indirect_ops = { + indirect_bytes_number, + indirect_decrement_key, + indirect_is_left_mergeable, + indirect_print_item, + indirect_check_item, + + indirect_create_vi, + indirect_check_left, + indirect_check_right, + indirect_part_size, + indirect_unit_num, + indirect_print_vi +}; + + +////////////////////////////////////////////////////////////////////////////// +// direntry functions +// + + +static int direntry_bytes_number (struct item_head * ih, int block_size) +{ + reiserfs_warning ("vs-16090: direntry_bytes_number: " + "bytes number is asked for direntry"); + return 0; +} + +static void direntry_decrement_key (struct cpu_key * key) +{ + cpu_key_k_offset_dec (key); + if (cpu_key_k_offset (key) == 0) + set_cpu_key_k_type (key, TYPE_STAT_DATA); +} + + +static int direntry_is_left_mergeable (struct key * key, unsigned long bsize) +{ + if (le32_to_cpu (key->u.k_offset_v1.k_offset) == DOT_OFFSET) + return 0; + return 1; + +} + + +static void direntry_print_item (struct item_head * ih, char * item) +{ + int i; + int namelen; + struct reiserfs_de_head * deh; + char * name; + static char namebuf [80]; + + + printk ("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status"); + + deh = (struct reiserfs_de_head *)item; + + for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { + namelen = (i ? ((deh - 1)->deh_location) : ih->ih_item_len) - deh->deh_location; + name = item + deh->deh_location; + if (name[namelen-1] == 0) + namelen = strlen (name); + namebuf[0] = '"'; + if (namelen > sizeof (namebuf) - 3) { + strncpy (namebuf + 1, name, sizeof (namebuf) - 3); + namebuf[sizeof (namebuf) - 2] = '"'; + namebuf[sizeof (namebuf) - 1] = 0; + } else { + memcpy (namebuf + 1, name, namelen); + namebuf[namelen + 1] = '"'; + namebuf[namelen + 2] = 0; + } + + printk ("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + i, namebuf, + deh->deh_dir_id, deh->deh_objectid, + GET_HASH_VALUE (deh_offset (deh)), GET_GENERATION_NUMBER ((deh_offset (deh))), + (de_hidden (deh)) ? "HIDDEN" : "VISIBLE"); + } +} + + +static void direntry_check_item (struct item_head * ih, char * item) +{ + int i; + struct reiserfs_de_head * deh; + + // FIXME: type something here! 
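+  /* the loop below only walks the entry headers; no per-entry checks
+  ** are performed yet (the loop body is empty). */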
+ deh = (struct reiserfs_de_head *)item; + for (i = 0; i < I_ENTRY_COUNT (ih); i ++, deh ++) { + ; + } +} + + + +#define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 + +struct direntry_uarea { + int flags; + short entry_count; + short entry_sizes[1]; +}; + + +/* + * function returns old entry number in directory item in real node + * using new entry number in virtual item in virtual node */ +static inline int old_entry_num (int is_affected, int virtual_entry_num, int pos_in_item, int mode) +{ + if ( mode == M_INSERT || mode == M_DELETE) + return virtual_entry_num; + + if (!is_affected) + /* cut or paste is applied to another item */ + return virtual_entry_num; + + if (virtual_entry_num < pos_in_item) + return virtual_entry_num; + + if (mode == M_CUT) + return virtual_entry_num + 1; + +#ifdef CONFIG_REISERFS_CHECK + if (mode != M_PASTE || virtual_entry_num == 0) + reiserfs_panic (0, "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode); +#endif + + return virtual_entry_num - 1; +} + + + + +/* Create an array of sizes of directory entries for virtual + item. Return space used by an item. FIXME: no control over + consuming of space used by this item handler */ +static int direntry_create_vi (struct virtual_node * vn, + struct virtual_item * vi, + int is_affected, + int insert_size) +{ + struct direntry_uarea * dir_u = vi->vi_uarea; + int i, j; + int size = sizeof (struct direntry_uarea); + struct reiserfs_de_head * deh; + + vi->vi_index = TYPE_DIRENTRY; + + if (!(vi->vi_ih) || !vi->vi_item) + BUG (); + + + dir_u->flags = 0; + if (le_ih_k_offset (vi->vi_ih) == DOT_OFFSET) + dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; + + deh = (struct reiserfs_de_head *)(vi->vi_item); + + + /* virtual directory item have this amount of entry after */ + dir_u->entry_count = ih_entry_count (vi->vi_ih) + + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : + (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); + + for (i = 0; i < dir_u->entry_count; i ++) { + j = old_entry_num (is_affected, i, vn->vn_pos_in_item, vn->vn_mode); + dir_u->entry_sizes[i] = (j ? le16_to_cpu (deh[j - 1].deh_location) : le16_to_cpu (vi->vi_ih->ih_item_len)) - + le16_to_cpu (deh[j].deh_location) + DEH_SIZE; + } + + size += (dir_u->entry_count * sizeof (short)); + + /* set size of pasted entry */ + if (is_affected && vn->vn_mode == M_PASTE) + dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; + + +#ifdef CONFIG_REISERFS_CHECK + /* compare total size of entries with item length */ + { + int k, l; + + l = 0; + for (k = 0; k < dir_u->entry_count; k ++) + l += dir_u->entry_sizes[k]; + + if (l + IH_SIZE != vi->vi_item_len + + ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? 
insert_size : 0) ) { + reiserfs_panic (0, "vs-8025: set_entry_sizes: (mode==%c, insert_size==%d), invalid length of directory item", + vn->vn_mode, insert_size); + } + } +#endif + + return size; + + +} + + +// +// return number of entries which may fit into specified amount of +// free space, or -1 if free space is not enough even for 1 entry +// +static int direntry_check_left (struct virtual_item * vi, int free, + int start_skip, int end_skip) +{ + int i; + int entries = 0; + struct direntry_uarea * dir_u = vi->vi_uarea; + + for (i = start_skip; i < dir_u->entry_count - end_skip; i ++) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries ++; + } + + if (entries == dir_u->entry_count) { + printk ("free spze %d, entry_count %d\n", free, dir_u->entry_count); + BUG (); + } + + /* "." and ".." can not be separated from each other */ + if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2) + entries = 0; + + return entries ?: -1; +} + + +static int direntry_check_right (struct virtual_item * vi, int free) +{ + int i; + int entries = 0; + struct direntry_uarea * dir_u = vi->vi_uarea; + + for (i = dir_u->entry_count - 1; i >= 0; i --) { + if (dir_u->entry_sizes[i] > free) + /* i-th entry doesn't fit into the remaining free space */ + break; + + free -= dir_u->entry_sizes[i]; + entries ++; + } + if (entries == dir_u->entry_count) + BUG (); + + /* "." and ".." can not be separated from each other */ + if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2) + entries = dir_u->entry_count - 2; + + return entries ?: -1; +} + + +/* sum of entry sizes between from-th and to-th entries including both edges */ +static int direntry_part_size (struct virtual_item * vi, int first, int count) +{ + int i, retval; + int from, to; + struct direntry_uarea * dir_u = vi->vi_uarea; + + retval = 0; + if (first == 0) + from = 0; + else + from = dir_u->entry_count - count; + to = from + count - 1; + + for (i = from; i <= to; i ++) + retval += dir_u->entry_sizes[i]; + + return retval; +} + +static int direntry_unit_num (struct virtual_item * vi) +{ + struct direntry_uarea * dir_u = vi->vi_uarea; + + return dir_u->entry_count; +} + + + +static void direntry_print_vi (struct virtual_item * vi) +{ + int i; + struct direntry_uarea * dir_u = vi->vi_uarea; + + reiserfs_warning ("DIRENTRY, index %d, type 0x%x, %h, flags 0x%x\n", + vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); + printk ("%d entries: ", dir_u->entry_count); + for (i = 0; i < dir_u->entry_count; i ++) + printk ("%d ", dir_u->entry_sizes[i]); + printk ("\n"); +} + +struct item_operations direntry_ops = { + direntry_bytes_number, + direntry_decrement_key, + direntry_is_left_mergeable, + direntry_print_item, + direntry_check_item, + + direntry_create_vi, + direntry_check_left, + direntry_check_right, + direntry_part_size, + direntry_unit_num, + direntry_print_vi +}; + + +////////////////////////////////////////////////////////////////////////////// +// +// +#if ! 
(TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3)
+   do not compile
+#endif
+
+struct item_operations * item_ops [4] = {
+  &stat_data_ops,
+  &indirect_ops,
+  &direct_ops,
+  &direntry_ops
+};
+
+
+
+
diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/journal.c linux/fs/reiserfs/journal.c
--- v2.4.0/linux/fs/reiserfs/journal.c Wed Dec 31 16:00:00 1969
+++ linux/fs/reiserfs/journal.c Mon Jan 15 15:31:19 2001
@@ -0,0 +1,3215 @@
+/*
+** Write ahead logging implementation copyright Chris Mason 2000
+**
+** The background commits make this code very interrelated, and
+** overly complex.  I need to rethink things a bit....The major players:
+**
+** journal_begin -- call with the number of blocks you expect to log.
+**                  If the current transaction is too
+**                  old, it will block until the current transaction is
+**                  finished, and then start a new one.
+**                  Usually, your transaction will get joined in with
+**                  previous ones for speed.
+**
+** journal_join  -- same as journal_begin, but won't block on the current
+**                  transaction regardless of age.  Don't ever call
+**                  this.  Ever.  There are only two places it should be
+**                  called from, and they are both inside this file.
+**
+** journal_mark_dirty -- adds blocks into this transaction.  clears any flags
+**                       that might make them get sent to disk
+**                       and then marks them BH_JDirty.  Puts the buffer head
+**                       into the current transaction hash.
+**
+** journal_end -- if the current transaction is batchable, it does nothing
+**                otherwise, it could do an async/synchronous commit, or
+**                a full flush of all log and real blocks in the
+**                transaction.
+**
+** flush_old_commits -- if the current transaction is too old, it is ended and
+**                      commit blocks are sent to disk.  Forces commit blocks
+**                      to disk for all backgrounded commits that have been
+**                      around too long.
+**                   -- Note, if you call this as an immediate flush from
+**                      within kupdate, it will ignore the immediate flag
+**
+** The commit thread -- a writer process for async commits.  It allows
+**                      a process to request a log flush on a task queue.
+**                      the commit will happen once the commit thread wakes up.
+**                      The benefit here is the writer (with whatever
+**                      related locks it has) doesn't have to wait for the
+**                      log blocks to hit disk if it doesn't want to.
+*/
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#else
+
+#include "nokernel.h"
+
+#endif
+
+
+/* the number of mounted filesystems.  This is used to decide when to
+** start and kill the commit thread
+*/
+static int reiserfs_mounted_fs_count = 0 ;
+
+/* wake this up when you add something to the commit thread task queue */
+DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_wait) ;
+
+/* wait on this if you need to be sure your task queue entries have been run */
+static DECLARE_WAIT_QUEUE_HEAD(reiserfs_commit_thread_done) ;
+DECLARE_TASK_QUEUE(reiserfs_commit_thread_tq) ;
+
+#define JOURNAL_TRANS_HALF 1018   /* must be correct to keep the desc and commit structs at 4k */
+
+/* cnode stat bits.  Move these into reiserfs_fs.h */
+
+#define BLOCK_FREED 2    /* this block was freed, and can't be written.
*/ +#define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ + +#define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ + +/* flags for do_journal_end */ +#define FLUSH_ALL 1 /* flush commit and real blocks */ +#define COMMIT_NOW 2 /* end and commit this transaction */ +#define WAIT 4 /* wait for the log blocks to hit the disk*/ + +/* state bits for the journal */ +#define WRITERS_BLOCKED 1 /* set when new writers not allowed */ + +static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; +static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; +static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; +static int can_dirty(struct reiserfs_journal_cnode *cn) ; + +static void init_journal_hash(struct super_block *p_s_sb) { + memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; +} + +/* +** clears BH_Dirty and sticks the buffer on the clean list. Called because I can't allow refile_buffer to +** make schedule happen after I've freed a block. Look at remove_from_transaction and journal_mark_freed for +** more details. +*/ +static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { + if (bh) { + clear_bit(BH_Dirty, &bh->b_state) ; +#if 0 + if (bh->b_list != BUF_CLEAN) { + reiserfs_file_buffer(bh, BUF_CLEAN) ; + } +#endif + } + return 0 ; +} + +static struct reiserfs_bitmap_node * +allocate_bitmap_node(struct super_block *p_s_sb) { + struct reiserfs_bitmap_node *bn ; + static int id = 0 ; + + bn = kmalloc(sizeof(struct reiserfs_bitmap_node), GFP_BUFFER) ; + if (!bn) { + return NULL ; + } + bn->data = kmalloc(p_s_sb->s_blocksize, GFP_BUFFER) ; + if (!bn->data) { + kfree(bn) ; + return NULL ; + } + bn->id = id++ ; + memset(bn->data, 0, p_s_sb->s_blocksize) ; + INIT_LIST_HEAD(&bn->list) ; + return bn ; +} + +static struct reiserfs_bitmap_node * +get_bitmap_node(struct super_block *p_s_sb) { + struct reiserfs_bitmap_node *bn = NULL; + struct list_head *entry = SB_JOURNAL(p_s_sb)->j_bitmap_nodes.next ; + + SB_JOURNAL(p_s_sb)->j_used_bitmap_nodes++ ; +repeat: + + if(entry != &SB_JOURNAL(p_s_sb)->j_bitmap_nodes) { + bn = list_entry(entry, struct reiserfs_bitmap_node, list) ; + list_del(entry) ; + memset(bn->data, 0, p_s_sb->s_blocksize) ; + SB_JOURNAL(p_s_sb)->j_free_bitmap_nodes-- ; + return bn ; + } + bn = allocate_bitmap_node(p_s_sb) ; + if (!bn) { + current->policy = SCHED_YIELD ; + schedule() ; + goto repeat ; + } + return bn ; +} +static inline void free_bitmap_node(struct super_block *p_s_sb, + struct reiserfs_bitmap_node *bn) { + SB_JOURNAL(p_s_sb)->j_used_bitmap_nodes-- ; + if (SB_JOURNAL(p_s_sb)->j_free_bitmap_nodes > REISERFS_MAX_BITMAP_NODES) { + kfree(bn->data) ; + kfree(bn) ; + } else { + list_add(&bn->list, &SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ; + SB_JOURNAL(p_s_sb)->j_free_bitmap_nodes++ ; + } +} + +static void allocate_bitmap_nodes(struct super_block *p_s_sb) { + int i ; + struct reiserfs_bitmap_node *bn = NULL ; + for (i = 0 ; i < REISERFS_MIN_BITMAP_NODES ; i++) { + bn = allocate_bitmap_node(p_s_sb) ; + if (bn) { + list_add(&bn->list, &SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ; + SB_JOURNAL(p_s_sb)->j_free_bitmap_nodes++ ; + } else { + break ; // this is ok, we'll try again when more are needed + } + } +} + +static int set_bit_in_list_bitmap(struct super_block *p_s_sb, int block, + struct reiserfs_list_bitmap *jb) { + int 
bmap_nr = block / (p_s_sb->s_blocksize << 3) ; + int bit_nr = block % (p_s_sb->s_blocksize << 3) ; + + if (!jb->bitmaps[bmap_nr]) { + jb->bitmaps[bmap_nr] = get_bitmap_node(p_s_sb) ; + } + set_bit(bit_nr, jb->bitmaps[bmap_nr]->data) ; + return 0 ; +} + +static void cleanup_bitmap_list(struct super_block *p_s_sb, + struct reiserfs_list_bitmap *jb) { + int i; + for (i = 0 ; i < SB_BMAP_NR(p_s_sb) ; i++) { + if (jb->bitmaps[i]) { + free_bitmap_node(p_s_sb, jb->bitmaps[i]) ; + jb->bitmaps[i] = NULL ; + } + } +} + +/* +** only call this on FS unmount. +*/ +static int free_list_bitmaps(struct super_block *p_s_sb, + struct reiserfs_list_bitmap *jb_array) { + int i ; + struct reiserfs_list_bitmap *jb ; + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + jb = jb_array + i ; + jb->journal_list = NULL ; + cleanup_bitmap_list(p_s_sb, jb) ; + vfree(jb->bitmaps) ; + jb->bitmaps = NULL ; + } + return 0; +} + +static int free_bitmap_nodes(struct super_block *p_s_sb) { + struct list_head *next = SB_JOURNAL(p_s_sb)->j_bitmap_nodes.next ; + struct reiserfs_bitmap_node *bn ; + + while(next != &SB_JOURNAL(p_s_sb)->j_bitmap_nodes) { + bn = list_entry(next, struct reiserfs_bitmap_node, list) ; + list_del(next) ; + kfree(bn->data) ; + kfree(bn) ; + next = SB_JOURNAL(p_s_sb)->j_bitmap_nodes.next ; + SB_JOURNAL(p_s_sb)->j_free_bitmap_nodes-- ; + } + + return 0 ; +} + +/* +** get memory for JOURNAL_NUM_BITMAPS worth of bitmaps. +** jb_array is the array to be filled in. +*/ +int reiserfs_allocate_list_bitmaps(struct super_block *p_s_sb, + struct reiserfs_list_bitmap *jb_array, + int bmap_nr) { + int i ; + int failed = 0 ; + struct reiserfs_list_bitmap *jb ; + int mem = bmap_nr * sizeof(struct reiserfs_bitmap_node *) ; + + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + jb = jb_array + i ; + jb->journal_list = NULL ; + jb->bitmaps = vmalloc( mem ) ; + if (!jb->bitmaps) { + reiserfs_warning("clm-2000, unable to allocate bitmaps for journal lists\n") ; + failed = 1; + break ; + } + memset(jb->bitmaps, 0, mem) ; + } + if (failed) { + free_list_bitmaps(p_s_sb, jb_array) ; + return -1 ; + } + return 0 ; +} + +/* +** find an available list bitmap. If you can't find one, flush a commit list +** and try again +*/ +static struct reiserfs_list_bitmap * +get_list_bitmap(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { + int i,j ; + struct reiserfs_list_bitmap *jb = NULL ; + + for (j = 0 ; j < (JOURNAL_NUM_BITMAPS * 3) ; j++) { + i = SB_JOURNAL(p_s_sb)->j_list_bitmap_index ; + SB_JOURNAL(p_s_sb)->j_list_bitmap_index = (i + 1) % JOURNAL_NUM_BITMAPS ; + jb = SB_JOURNAL(p_s_sb)->j_list_bitmap + i ; + if (SB_JOURNAL(p_s_sb)->j_list_bitmap[i].journal_list) { + flush_commit_list(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap[i].journal_list, 1) ; + if (!SB_JOURNAL(p_s_sb)->j_list_bitmap[i].journal_list) { + break ; + } + } else { + break ; + } + } + if (jb->journal_list) { /* double check to make sure if flushed correctly */ + return NULL ; + } + jb->journal_list = jl ; + return jb ; +} + +/* +** allocates a new chunk of X nodes, and links them all together as a list. 
+** Uses the cnode->next and cnode->prev pointers +** returns NULL on failure +*/ +static struct reiserfs_journal_cnode *allocate_cnodes(int num_cnodes) { + struct reiserfs_journal_cnode *head ; + int i ; + if (num_cnodes <= 0) { + return NULL ; + } + head = vmalloc(num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; + if (!head) { + return NULL ; + } + memset(head, 0, num_cnodes * sizeof(struct reiserfs_journal_cnode)) ; + head[0].prev = NULL ; + head[0].next = head + 1 ; + for (i = 1 ; i < num_cnodes; i++) { + head[i].prev = head + (i - 1) ; + head[i].next = head + (i + 1) ; /* if last one, overwrite it after the if */ + } + head[num_cnodes -1].next = NULL ; + return head ; +} + +/* +** pulls a cnode off the free list, or returns NULL on failure +*/ +static struct reiserfs_journal_cnode *get_cnode(struct super_block *p_s_sb) { + struct reiserfs_journal_cnode *cn ; + + reiserfs_check_lock_depth("get_cnode") ; + + if (SB_JOURNAL(p_s_sb)->j_cnode_free <= 0) { + return NULL ; + } + SB_JOURNAL(p_s_sb)->j_cnode_used++ ; + SB_JOURNAL(p_s_sb)->j_cnode_free-- ; + cn = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; + if (!cn) { + return cn ; + } + if (cn->next) { + cn->next->prev = NULL ; + } + SB_JOURNAL(p_s_sb)->j_cnode_free_list = cn->next ; + memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; + return cn ; +} + +/* +** returns a cnode to the free list +*/ +static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode *cn) { + + reiserfs_check_lock_depth("free_cnode") ; + + SB_JOURNAL(p_s_sb)->j_cnode_used-- ; + SB_JOURNAL(p_s_sb)->j_cnode_free++ ; + /* memset(cn, 0, sizeof(struct reiserfs_journal_cnode)) ; */ + cn->next = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; + if (SB_JOURNAL(p_s_sb)->j_cnode_free_list) { + SB_JOURNAL(p_s_sb)->j_cnode_free_list->prev = cn ; + } + cn->prev = NULL ; /* not needed with the memset, but I might kill the memset, and forget to do this */ + SB_JOURNAL(p_s_sb)->j_cnode_free_list = cn ; +} + +static int clear_prepared_bits(struct buffer_head *bh) { + clear_bit(BH_JPrepared, &bh->b_state) ; + return 0 ; +} + +/* buffer is in current transaction */ +inline int buffer_journaled(struct buffer_head *bh) { + if (bh) + return test_bit(BH_JDirty, &bh->b_state) ; + else + return 0 ; +} + +/* disk block was taken off free list before being in a finished transation, or written to disk +** journal_new blocks can be reused immediately, for any purpose +*/ +inline int buffer_journal_new(struct buffer_head *bh) { + if (bh) + return test_bit(BH_JNew, &bh->b_state) ; + else + return 0 ; +} + +inline int mark_buffer_journal_new(struct buffer_head *bh) { + if (bh) { + set_bit(BH_JNew, &bh->b_state) ; + } + return 0 ; +} + +inline int mark_buffer_not_journaled(struct buffer_head *bh) { + if (bh) + clear_bit(BH_JDirty, &bh->b_state) ; + return 0 ; +} + +/* utility function to force a BUG if it is called without the big +** kernel lock held. 
caller is the string printed just before calling BUG() +*/ +void reiserfs_check_lock_depth(char *caller) { +#ifdef CONFIG_SMP + if (current->lock_depth < 0) { + printk("%s called without kernel lock held\n", caller) ; + show_reiserfs_locks() ; + BUG() ; + } +#else + ; +#endif +} + +/* return a cnode with same dev, block number and size in table, or null if not found */ +static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct reiserfs_journal_cnode **table, + kdev_t dev,long bl,int size) { + struct reiserfs_journal_cnode *cn ; + cn = journal_hash(table, dev, bl) ; + while(cn) { + if ((cn->blocknr == bl) && (cn->dev == dev)) + return cn ; + cn = cn->hnext ; + } + return (struct reiserfs_journal_cnode *)0 ; +} + +/* returns a cnode with same size, block number and dev as bh in the current transaction hash. NULL if not found */ +static inline struct reiserfs_journal_cnode *get_journal_hash(struct super_block *p_s_sb, struct buffer_head *bh) { + struct reiserfs_journal_cnode *cn ; + if (bh) { + cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, bh->b_dev, bh->b_blocknr, bh->b_size) ; + } + else { + return (struct reiserfs_journal_cnode *)0 ; + } + return cn ; +} + +/* once upon a time, the journal would deadlock. a lot. Now, when +** CONFIG_REISERFS_CHECK is defined, anytime someone enters a +** transaction, it pushes itself into this ugly static list, and pops +** itself off before calling journal_end. I made a SysRq key to dump +** the list, and tell me what the writers are when I'm deadlocked. */ + + /* are you depending on the compiler + to optimize this function away + everywhere it is called? It is not + obvious how this works, but I + suppose debugging code need not be + clear. -Hans */ +static char *journal_writers[512] ; +int push_journal_writer(char *s) { +#ifdef CONFIG_REISERFS_CHECK + int i ; + for (i = 0 ; i < 512 ; i++) { + if (!journal_writers[i]) { + journal_writers[i] = s ; + return i ; + } + } + return -1 ; +#else + return 0 ; +#endif +} +int pop_journal_writer(int index) { +#ifdef CONFIG_REISERFS_CHECK + if (index >= 0) { + journal_writers[index] = NULL ; + } +#endif + return 0 ; +} + +int dump_journal_writers(void) { + int i ; + for (i = 0 ; i < 512 ; i++) { + if (journal_writers[i]) { + printk("%d: %s\n", i, journal_writers[i]) ; + } + } + return 0 ; +} + +/* +** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated +** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever +** being overwritten by a replay after crashing. +** +** If you don't set search_all, a block can only be allocated if it is not in the current transaction. Since deleting +** a block removes it from the current transaction, this case should never happen. If you don't set search_all, make +** sure you never write the block without logging it. +** +** next_zero_bit is a suggestion about the next block to try for find_forward. +** when bl is rejected because it is set in a journal list bitmap, we search +** for the next zero bit in the bitmap that rejected bl. Then, we return that +** through next_zero_bit for find_forward to try. 
+** +** Just because we return something in next_zero_bit does not mean we won't +** reject it on the next call to reiserfs_in_journal +** +*/ +int reiserfs_in_journal(struct super_block *p_s_sb, kdev_t dev, + unsigned long bl, int size, int search_all, + unsigned long *next_zero_bit) { + struct reiserfs_journal_cnode *cn ; + struct reiserfs_list_bitmap *jb ; + int i ; + int bmap_nr = bl / (p_s_sb->s_blocksize << 3) ; + int bit_nr = bl % (p_s_sb->s_blocksize << 3) ; + int tmp_bit ; + + *next_zero_bit = 0 ; /* always start this at zero. */ + + /* we aren't logging all blocks are safe for reuse */ + if (reiserfs_dont_log(p_s_sb)) { + return 0 ; + } + + /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. + ** if we crash before the transaction that freed it commits, this transaction won't + ** have committed either, and the block will never be written + */ + if (search_all) { + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + jb = SB_JOURNAL(p_s_sb)->j_list_bitmap + i ; + if (jb->journal_list && jb->bitmaps[bmap_nr] && + test_bit(bit_nr, jb->bitmaps[bmap_nr]->data)) { + tmp_bit = find_next_zero_bit((unsigned long *) + (jb->bitmaps[bmap_nr]->data), + p_s_sb->s_blocksize << 3, bit_nr+1) ; + *next_zero_bit = bmap_nr * (p_s_sb->s_blocksize << 3) + tmp_bit ; + return 1 ; + } + } + } + + /* is it in any old transactions? */ + if (search_all && (cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, dev,bl,size))) { + return 1; + } + + /* is it in the current transaction. This should never happen */ + if ((cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, dev,bl,size))) { + return 1; + } + + /* safe for reuse */ + return 0 ; +} + +/* insert cn into table +*/ +inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_cnode *cn) { + struct reiserfs_journal_cnode *cn_orig ; + + cn_orig = journal_hash(table, cn->dev, cn->blocknr) ; + cn->hnext = cn_orig ; + cn->hprev = NULL ; + if (cn_orig) { + cn_orig->hprev = cn ; + } + journal_hash(table, cn->dev, cn->blocknr) = cn ; +} + +/* lock the current transaction */ +inline static void lock_journal(struct super_block *p_s_sb) { + while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { + sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + } + atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ; +} + +/* unlock the current transaction */ +inline static void unlock_journal(struct super_block *p_s_sb) { + atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ; + wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ; +} + +/* +** this used to be much more involved, and I'm keeping it just in case things get ugly again. +** it gets called by flush_commit_list, and cleans up any data stored about blocks freed during a +** transaction. +*/ +static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl) { + + struct reiserfs_list_bitmap *jb = jl->j_list_bitmap ; + if (jb) { + cleanup_bitmap_list(p_s_sb, jb) ; + } + jl->j_list_bitmap->journal_list = NULL ; + jl->j_list_bitmap = NULL ; +} + +/* +** if this journal list still has commit blocks unflushed, send them to disk. 
+**
+** log areas must be flushed in order (transaction 2 can't commit before transaction 1)
+** Before the commit block can be written, every other log block must be safely on disk
+**
+*/
+static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
+  int i, count ;
+  int index = 0 ;
+  int bn ;
+  int retry_count = 0 ;
+  int orig_commit_left = 0 ;
+  struct buffer_head *tbh = NULL ;
+  struct reiserfs_journal_list *other_jl ;
+
+  reiserfs_check_lock_depth("flush_commit_list") ;
+
+  if (atomic_read(&jl->j_older_commits_done)) {
+    return 0 ;
+  }
+
+  /* before we can put our commit blocks on disk, we have to make sure everyone older than
+  ** us is on disk too
+  */
+  if (jl->j_len <= 0) {
+    return 0 ;
+  }
+  if (flushall) {
+    /* we _must_ make sure the transactions are committed in order.  Start with the
+    ** index after this one, wrap all the way around
+    */
+    index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
+    for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
+      other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
+      if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 &&
+          other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
+        flush_commit_list(s, other_jl, 0) ;
+      }
+    }
+  }
+
+  count = 0 ;
+  /* don't flush the commit list for the current transaction */
+  if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
+    return 0 ;
+  }
+
+  /* make sure nobody is trying to flush this one at the same time */
+  if (atomic_read(&(jl->j_commit_flushing))) {
+    sleep_on(&(jl->j_commit_wait)) ;
+    if (flushall) {
+      atomic_set(&(jl->j_older_commits_done), 1) ;
+    }
+    return 0 ;
+  }
+
+  /* this commit is done, exit */
+  if (atomic_read(&(jl->j_commit_left)) <= 0) {
+    if (flushall) {
+      atomic_set(&(jl->j_older_commits_done), 1) ;
+    }
+    return 0 ;
+  }
+  /* keeps others from flushing while we are flushing */
+  atomic_set(&(jl->j_commit_flushing), 1) ;
+
+
+  if (jl->j_len > JOURNAL_TRANS_MAX) {
+    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
+    return 0 ;
+  }
+
+  orig_commit_left = atomic_read(&(jl->j_commit_left)) ;
+
+  /* start by checking all the commit blocks in this transaction.
+  ** Add anyone not on disk into tbh.
Stop checking once commit_left <= 1, because that means we + ** only have the commit block left + */ +retry: + count = 0 ; + for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ + bn = reiserfs_get_journal_block(s) + (jl->j_start+i) % JOURNAL_BLOCK_COUNT; + tbh = get_hash_table(s->s_dev, bn, s->s_blocksize) ; + +/* kill this sanity check */ +if (count > (orig_commit_left + 2)) { +reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ; +} + if (tbh) { + if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */ + wait_on_buffer(tbh) ; + if (!buffer_uptodate(tbh)) { + reiserfs_panic(s, "journal-584, buffer write failed\n") ; + } + } + if (buffer_dirty(tbh)) { + printk("journal-569: flush_commit_list, block already dirty!\n") ; + } else { + mark_buffer_dirty(tbh) ; + } + ll_rw_block(WRITE, 1, &tbh) ; + count++ ; + atomic_dec(&(tbh->b_count)) ; /* once for our get_hash */ + } + } + + /* wait on everyone in tbh before writing commit block*/ + if (count > 0) { + for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && + i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ + bn = reiserfs_get_journal_block(s) + (jl->j_start + i) % JOURNAL_BLOCK_COUNT ; + tbh = get_hash_table(s->s_dev, bn, s->s_blocksize) ; + + wait_on_buffer(tbh) ; + if (!buffer_uptodate(tbh)) { + reiserfs_panic(s, "journal-601, buffer write failed\n") ; + } + atomic_dec(&(tbh->b_count)) ; /* once for our get_hash */ + bforget(tbh) ; /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)) ; + } + } + + if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */ + if (retry_count < 2) { + printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ; + retry_count++ ; + goto retry; + } + reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %lu, should be 1\n", + atomic_read(&(jl->j_commit_left))); + } + + mark_buffer_dirty(jl->j_commit_bh) ; + ll_rw_block(WRITE, 1, &(jl->j_commit_bh)) ; + wait_on_buffer(jl->j_commit_bh) ; + if (!buffer_uptodate(jl->j_commit_bh)) { + reiserfs_panic(s, "journal-615: buffer write failed\n") ; + } + atomic_dec(&(jl->j_commit_left)) ; + bforget(jl->j_commit_bh) ; + + /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ + cleanup_freed_for_journal_list(s, jl) ; + + if (flushall) { + atomic_set(&(jl->j_older_commits_done), 1) ; + } + atomic_set(&(jl->j_commit_flushing), 0) ; + wake_up(&(jl->j_commit_wait)) ; + return 0 ; +} + +/* +** flush_journal_list frequently needs to find a newer transaction for a given block. This does that, or +** returns NULL if it can't find anything +*/ +static struct reiserfs_journal_list *find_newer_jl_for_cn(struct reiserfs_journal_cnode *cn) { + kdev_t dev = cn->dev; + unsigned long blocknr = cn->blocknr ; + + cn = cn->hprev ; + while(cn) { + if (cn->dev == dev && cn->blocknr == blocknr && cn->jlist) { + return cn->jlist ; + } + cn = cn->hprev ; + } + return NULL ; +} + + +/* +** once all the real blocks have been flushed, it is safe to remove them from the +** journal list for this transaction. Aside from freeing the cnode, this also allows the +** block to be reallocated for data blocks if it had been deleted. 
+*/ +static void remove_all_from_journal_list(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, int debug) { + struct buffer_head fake_bh ; + struct reiserfs_journal_cnode *cn, *last ; + cn = jl->j_realblock ; + + /* which is better, to lock once around the whole loop, or + ** to lock for each call to remove_from_journal_list? + */ + while(cn) { + if (cn->blocknr != 0) { + if (debug) { + printk("block %lu, bh is %d, state %d\n", cn->blocknr, cn->bh ? 1: 0, + cn->state) ; + } + fake_bh.b_blocknr = cn->blocknr ; + fake_bh.b_dev = cn->dev ; + cn->state = 0 ; + remove_from_journal_list(p_s_sb, jl, &fake_bh, 1) ; + } + last = cn ; + cn = cn->next ; + free_cnode(p_s_sb, last) ; + } + jl->j_realblock = NULL ; +} + +/* +** if this timestamp is greater than the timestamp we wrote last to the header block, write it to the header block. +** once this is done, I can safely say the log area for this transaction won't ever be replayed, and I can start +** releasing blocks in this transaction for reuse as data blocks. +** called by flush_journal_list, before it calls remove_all_from_journal_list +** +*/ +static int update_journal_header_block(struct super_block *p_s_sb, unsigned long offset, unsigned long trans_id) { + struct reiserfs_journal_header *jh ; + if (trans_id >= SB_JOURNAL(p_s_sb)->j_last_flush_trans_id) { + if (buffer_locked((SB_JOURNAL(p_s_sb)->j_header_bh))) { + wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { + reiserfs_panic(p_s_sb, "journal-699: buffer write failed\n") ; + } + } + SB_JOURNAL(p_s_sb)->j_last_flush_trans_id = trans_id ; + SB_JOURNAL(p_s_sb)->j_first_unflushed_offset = offset ; + jh = (struct reiserfs_journal_header *)(SB_JOURNAL(p_s_sb)->j_header_bh->b_data) ; + jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; + jh->j_first_unflushed_offset = cpu_to_le32(offset) ; + jh->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; + set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; + ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; + wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { + reiserfs_panic(p_s_sb, "journal-712: buffer write failed\n") ; + } + } + return 0 ; +} + +/* +** flush any and all journal lists older than you are +** can only be called from flush_journal_list +*/ +static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) { + int i, index ; + struct reiserfs_journal_list *other_jl ; + + index = jl - SB_JOURNAL_LIST(p_s_sb) ; + for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { + other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ; + if (other_jl && other_jl->j_len > 0 && + other_jl->j_trans_id > 0 && + other_jl->j_trans_id < trans_id && + other_jl != jl) { + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0) ; + } + } + return 0 ; +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + if (buffer_journaled(bh)) { + reiserfs_warning("clm-2084: pinned buffer %u:%s sent to disk\n", + bh->b_blocknr, kdevname(bh->b_dev)) ; + } + mark_buffer_uptodate(bh, uptodate) ; + unlock_buffer(bh) ; +} +static void submit_logged_buffer(struct buffer_head *bh) { + lock_buffer(bh) ; + bh->b_end_io = reiserfs_end_buffer_io_sync ; + mark_buffer_notjournal_new(bh) ; + clear_bit(BH_Dirty, &bh->b_state) ; + submit_bh(WRITE, bh) ; +} + +/* flush a journal list, both commit and real blocks +** +** 
always set flushall to 1, unless you are calling from inside +** flush_journal_list +** +** IMPORTANT. This can only be called while there are no journal writers, +** and the journal is locked. That means it can only be called from +** do_journal_end, or by journal_release +*/ +static int flush_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl, int flushall) { + struct reiserfs_journal_list *pjl ; + struct reiserfs_journal_cnode *cn, *last ; + int count ; + int was_jwait = 0 ; + int was_dirty = 0 ; + struct buffer_head *saved_bh ; + unsigned long j_len_saved = jl->j_len ; + + if (j_len_saved <= 0) { + return 0 ; + } + + if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) { + reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n", + atomic_read(&SB_JOURNAL(s)->j_wcount)) ; + } + /* if someone is getting the commit list, we must wait for them */ + while (atomic_read(&(jl->j_commit_flushing))) { + sleep_on(&(jl->j_commit_wait)) ; + } + /* if someone is flushing this list, we must wait for them */ + while (atomic_read(&(jl->j_flushing))) { + sleep_on(&(jl->j_flush_wait)) ; + } + + /* this list is now ours, we can change anything we want */ + atomic_set(&(jl->j_flushing), 1) ; + + count = 0 ; + if (j_len_saved > JOURNAL_TRANS_MAX) { + reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ; + atomic_dec(&(jl->j_flushing)) ; + return 0 ; + } + + /* if all the work is already done, get out of here */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return ; + } + + /* start by putting the commit list on disk. This will also flush + ** the commit lists of any olders transactions + */ + flush_commit_list(s, jl, 1) ; + + /* are we done now? */ + if (atomic_read(&(jl->j_nonzerolen)) <= 0 && + atomic_read(&(jl->j_commit_left)) <= 0) { + goto flush_older_and_return ; + } + + /* loop through each cnode, see if we need to write it, + ** or wait on a more recent transaction, or just ignore it + */ + if (atomic_read(&(SB_JOURNAL(s)->j_wcount)) != 0) { + reiserfs_panic(s, "journal-844: panic journal list is flushing, wcount is not 0\n") ; + } + cn = jl->j_realblock ; + while(cn) { + was_jwait = 0 ; + was_dirty = 0 ; + saved_bh = NULL ; + /* blocknr of 0 is no longer in the hash, ignore it */ + if (cn->blocknr == 0) { + goto free_cnode ; + } + pjl = find_newer_jl_for_cn(cn) ; + /* the order is important here. We check pjl to make sure we + ** don't clear BH_JDirty_wait if we aren't the one writing this + ** block to disk + */ + if (!pjl && cn->bh) { + saved_bh = cn->bh ; + + /* we do this to make sure nobody releases the buffer while + ** we are working with it + */ + atomic_inc(&(saved_bh->b_count)) ; + + if (buffer_journal_dirty(saved_bh)) { + was_jwait = 1 ; + mark_buffer_notjournal_dirty(saved_bh) ; + /* brelse the inc from journal_mark_dirty */ + atomic_dec(&(saved_bh->b_count)) ; + } + if (can_dirty(cn)) { + was_dirty = 1 ; + } + } + + /* if someone has this block in a newer transaction, just make + ** sure they are commited, and don't try writing it to disk + */ + if (pjl) { + flush_commit_list(s, pjl, 1) ; + goto free_cnode ; + } + + /* bh == NULL when the block got to disk on its own, OR, + ** the block got freed in a future transaction + */ + if (saved_bh == NULL) { + goto free_cnode ; + } + + /* this should never happen. 
kupdate_one_transaction has this list + ** locked while it works, so we should never see a buffer here that + ** is not marked JDirty_wait + */ + if ((!was_jwait) && !buffer_locked(saved_bh)) { +printk("journal-813: BAD! buffer %lu %cdirty %cjwait, not in a newer tranasction\n", saved_bh->b_blocknr, + was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ; + } + /* kupdate_one_transaction waits on the buffers it is writing, so we + ** should never see locked buffers here + */ + if (buffer_locked(saved_bh)) { + printk("clm-2083: locked buffer %lu in flush_journal_list\n", + saved_bh->b_blocknr) ; + wait_on_buffer(saved_bh) ; + if (!buffer_uptodate(saved_bh)) { + reiserfs_panic(s, "journal-923: buffer write failed\n") ; + } + } + if (was_dirty) { + /* we inc again because saved_bh gets decremented at free_cnode */ + atomic_inc(&(saved_bh->b_count)) ; + set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; + submit_logged_buffer(saved_bh) ; + count++ ; + } else { + printk("clm-2082: Unable to flush buffer %lu in flush_journal_list\n", + saved_bh->b_blocknr) ; + } +free_cnode: + last = cn ; + cn = cn->next ; + if (saved_bh) { + /* we incremented this to keep others from taking the buffer head away */ + atomic_dec(&(saved_bh->b_count)); + if (atomic_read(&(saved_bh->b_count)) < 0) { + printk("journal-945: saved_bh->b_count < 0") ; + } + } + } + if (count > 0) { + cn = jl->j_realblock ; + while(cn) { + if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + if (!cn->bh) { + reiserfs_panic(s, "journal-1011: cn->bh is NULL\n") ; + } + wait_on_buffer(cn->bh) ; + if (!cn->bh) { + reiserfs_panic(s, "journal-1012: cn->bh is NULL\n") ; + } + if (!buffer_uptodate(cn->bh)) { + reiserfs_panic(s, "journal-949: buffer write failed\n") ; + } + refile_buffer(cn->bh) ; + brelse(cn->bh) ; + } + cn = cn->next ; + } + } + +flush_older_and_return: + /* before we can update the journal header block, we _must_ flush all + ** real blocks from all older transactions to disk. This is because + ** once the header block is updated, this transaction will not be + ** replayed after a crash + */ + if (flushall) { + flush_older_journal_lists(s, jl, jl->j_trans_id) ; + } + + /* before we can remove everything from the hash tables for this + ** transaction, we must make sure it can never be replayed + ** + ** since we are only called from do_journal_end, we know for sure there + ** are no allocations going on while we are flushing journal lists. 
So, + ** we only need to update the journal header block for the last list + ** being flushed + */ + if (flushall) { + update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % JOURNAL_BLOCK_COUNT, jl->j_trans_id) ; + } + remove_all_from_journal_list(s, jl, 0) ; + jl->j_len = 0 ; + atomic_set(&(jl->j_nonzerolen), 0) ; + jl->j_start = 0 ; + jl->j_realblock = NULL ; + jl->j_commit_bh = NULL ; + jl->j_trans_id = 0 ; + atomic_dec(&(jl->j_flushing)) ; + wake_up(&(jl->j_flush_wait)) ; + return 0 ; +} + + +static int kupdate_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_list *pjl ; /* previous list for this cn */ + struct reiserfs_journal_cnode *cn, *walk_cn ; + unsigned long blocknr ; + int run = 0 ; + int orig_trans_id = jl->j_trans_id ; + struct buffer_head *saved_bh ; + int ret = 0 ; + + /* if someone is getting the commit list, we must wait for them */ + while (atomic_read(&(jl->j_commit_flushing))) { + sleep_on(&(jl->j_commit_wait)) ; + } + /* if someone is flushing this list, we must wait for them */ + while (atomic_read(&(jl->j_flushing))) { + sleep_on(&(jl->j_flush_wait)) ; + } + /* was it flushed while we slept? */ + if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) { + return 0 ; + } + + /* this list is now ours, we can change anything we want */ + atomic_set(&(jl->j_flushing), 1) ; + +loop_start: + cn = jl->j_realblock ; + while(cn) { + saved_bh = NULL ; + /* if the blocknr == 0, this has been cleared from the hash, + ** skip it + */ + if (cn->blocknr == 0) { + goto next ; + } + /* look for a more recent transaction that logged this + ** buffer. Only the most recent transaction with a buffer in + ** it is allowed to send that buffer to disk + */ + pjl = find_newer_jl_for_cn(cn) ; + if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) && + can_dirty(cn)) + { + if (!test_bit(BH_JPrepared, &cn->bh->b_state)) { + set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; + submit_logged_buffer(cn->bh) ; + } else { + /* someone else is using this buffer. We can't + ** send it to disk right now because they might + ** be changing/logging it. + */ + ret = 1 ; + } + } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { + clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; + if (!pjl && cn->bh) { + wait_on_buffer(cn->bh) ; + } + /* check again, someone could have logged while we scheduled */ + pjl = find_newer_jl_for_cn(cn) ; + + /* before the JDirty_wait bit is set, the + ** buffer is added to the hash list. So, if we are + ** run in the middle of a do_journal_end, we will notice + ** if this buffer was logged and added from the latest + ** transaction. In this case, we don't want to decrement + ** b_count + */ + if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) { + blocknr = cn->blocknr ; + walk_cn = cn ; + saved_bh= cn->bh ; + /* update all older transactions to show this block + ** was flushed + */ + mark_buffer_notjournal_dirty(cn->bh) ; + while(walk_cn) { + if (walk_cn->bh && walk_cn->blocknr == blocknr && + walk_cn->dev == cn->dev) { + if (walk_cn->jlist) { + atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ; + } + walk_cn->bh = NULL ; + } + walk_cn = walk_cn->hnext ; + } + if (atomic_read(&saved_bh->b_count) < 1) { + reiserfs_warning("clm-2081: bad count on %lu\n", + saved_bh->b_blocknr) ; + } + brelse(saved_bh) ; + } + } + /* + ** if the more recent transaction is committed to the log, + ** this buffer can be considered flushed. Decrement our + ** counters to reflect one less buffer that needs writing. 
+ ** + ** note, this relies on all of the above code being + ** schedule free once pjl comes back non-null. + */ + if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) { + atomic_dec(&cn->jlist->j_nonzerolen) ; + cn->bh = NULL ; + } +next: + cn = cn->next ; + } + /* the first run through the loop sends all the dirty buffers to + ** ll_rw_block. + ** the second run through the loop does all the accounting + */ + if (run++ == 0) { + goto loop_start ; + } + + atomic_set(&(jl->j_flushing), 0) ; + wake_up(&(jl->j_flush_wait)) ; + return ret ; +} +/* since we never give dirty buffers to bdflush/kupdate, we have to +** flush them ourselves. This runs through the journal lists, finds +** old metadata in need of flushing and sends it to disk. +** this does not end transactions, commit anything, or free +** cnodes. +** +** returns the highest transaction id that was flushed last time +*/ +static unsigned long reiserfs_journal_kupdate(struct super_block *s) { + struct reiserfs_journal_list *jl ; + int i ; + int start ; + time_t age ; + int ret = 0 ; + + start = SB_JOURNAL_LIST_INDEX(s) ; + + /* safety check to prevent flush attempts during a mount */ + if (start < 0) { + return 0 ; + } + i = (start + 1) % JOURNAL_LIST_COUNT ; + while(i != start) { + jl = SB_JOURNAL_LIST(s) + i ; + age = CURRENT_TIME - jl->j_timestamp ; + if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && + atomic_read(&(jl->j_nonzerolen)) > 0 && + atomic_read(&(jl->j_commit_left)) == 0) { + + if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) { + break ; + } + /* if ret was already 1, we want to preserve that */ + ret |= kupdate_one_transaction(s, jl) ; + } + if (atomic_read(&(jl->j_nonzerolen)) > 0) { + ret |= 1 ; + } + i = (i + 1) % JOURNAL_LIST_COUNT ; + } + return ret ; +} + +/* +** removes any nodes in table with name block and dev as bh. +** only touchs the hnext and hprev pointers. +*/ +void remove_journal_hash(struct reiserfs_journal_cnode **table, struct reiserfs_journal_list *jl,struct buffer_head *bh, + int remove_freed){ + struct reiserfs_journal_cnode *cur ; + struct reiserfs_journal_cnode **head ; + + if (!bh) + return ; + + head= &(journal_hash(table, bh->b_dev, bh->b_blocknr)) ; + if (!head) { + return ; + } + cur = *head ; + while(cur) { + if (cur->blocknr == bh->b_blocknr && cur->dev == bh->b_dev && (jl == NULL || jl == cur->jlist) && + (!test_bit(BLOCK_FREED, &cur->state) || remove_freed)) { + if (cur->hnext) { + cur->hnext->hprev = cur->hprev ; + } + if (cur->hprev) { + cur->hprev->hnext = cur->hnext ; + } else { + *head = cur->hnext ; + } + cur->blocknr = 0 ; + cur->dev = 0 ; + cur->state = 0 ; + if (cur->bh && cur->jlist) /* anybody who clears the cur->bh will also dec the nonzerolen */ + atomic_dec(&(cur->jlist->j_nonzerolen)) ; + cur->bh = NULL ; + cur->jlist = NULL ; + } + cur = cur->hnext ; + } +} + +static void free_journal_ram(struct super_block *p_s_sb) { + vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ; + free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ; + free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */ + if (SB_JOURNAL(p_s_sb)->j_header_bh) { + brelse(SB_JOURNAL(p_s_sb)->j_header_bh) ; + } + vfree(SB_JOURNAL(p_s_sb)) ; +} + +/* +** call on unmount. Only set error to 1 if you haven't made your way out +** of read_super() yet. Any other caller must keep error at 0. 
+*/ +static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, int error) { + struct reiserfs_transaction_handle myth ; + + /* we only want to flush out transactions if we were called with error == 0 + */ + if (!error && !(p_s_sb->s_flags & MS_RDONLY)) { + /* end the current trans */ + do_journal_end(th, p_s_sb,10, FLUSH_ALL) ; + + /* make sure something gets logged to force our way into the flush code */ + journal_join(&myth, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&myth, p_s_sb,1, FLUSH_ALL) ; + } + + /* we decrement before we wake up, because the commit thread dies off + ** when it has been woken up and the count is <= 0 + */ + reiserfs_mounted_fs_count-- ; + wake_up(&reiserfs_commit_thread_wait) ; + sleep_on(&reiserfs_commit_thread_done) ; + + free_journal_ram(p_s_sb) ; + + return 0 ; +} + +/* +** call on unmount. flush all journal trans, release all alloc'd ram +*/ +int journal_release(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { + return do_journal_release(th, p_s_sb, 0) ; +} +/* +** only call from an error condition inside reiserfs_read_super! +*/ +int journal_release_error(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb) { + return do_journal_release(th, p_s_sb, 1) ; +} + +/* compares description block with commit block. returns 1 if they differ, 0 if they are the same */ +static int journal_compare_desc_commit(struct super_block *p_s_sb, struct reiserfs_journal_desc *desc, + struct reiserfs_journal_commit *commit) { + if (le32_to_cpu(commit->j_trans_id) != le32_to_cpu(desc->j_trans_id) || + le32_to_cpu(commit->j_len) != le32_to_cpu(desc->j_len) || + le32_to_cpu(commit->j_len) > JOURNAL_TRANS_MAX || + le32_to_cpu(commit->j_len) <= 0 + ) { + return 1 ; + } + return 0 ; +} +/* returns 0 if it did not find a description block +** returns -1 if it found a corrupt commit block +** returns 1 if both desc and commit were valid +*/ +static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffer_head *d_bh, unsigned long *oldest_invalid_trans_id, unsigned long *newest_mount_id) { + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + struct buffer_head *c_bh ; + unsigned long offset ; + + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + if (le32_to_cpu(desc->j_len) > 0 && !memcmp(desc->j_magic, JOURNAL_DESC_MAGIC, 8)) { + if (oldest_invalid_trans_id && *oldest_invalid_trans_id && le32_to_cpu(desc->j_trans_id) > *oldest_invalid_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-986: transaction " + "is valid returning because trans_id %d is greater than " + "oldest_invalid %lu\n", le32_to_cpu(desc->j_trans_id), + *oldest_invalid_trans_id); + return 0 ; + } + if (newest_mount_id && *newest_mount_id > le32_to_cpu(desc->j_mount_id)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1087: transaction " + "is valid returning because mount_id %d is less than " + "newest_mount_id %lu\n", desc->j_mount_id, + *newest_mount_id) ; + return -1 ; + } + offset = d_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb) ; + + /* ok, we have a journal description block, lets see if the transaction was valid */ + c_bh = bread(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((offset + le32_to_cpu(desc->j_len) + 1) % JOURNAL_BLOCK_COUNT), + p_s_sb->s_blocksize) ; + if (!c_bh) + return 0 ; + commit = (struct 
reiserfs_journal_commit *)c_bh->b_data ; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, + "journal_transaction_is_valid, commit offset %ld had bad " + "time %d or length %d\n", + c_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb), + le32_to_cpu(commit->j_trans_id), + le32_to_cpu(commit->j_len)); + brelse(c_bh) ; + if (oldest_invalid_trans_id) + *oldest_invalid_trans_id = le32_to_cpu(desc->j_trans_id) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1004: " + "transaction_is_valid setting oldest invalid trans_id " + "to %d\n", le32_to_cpu(desc->j_trans_id)) ; + return -1; + } + brelse(c_bh) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid " + "transaction start offset %lu, len %d id %d\n", + d_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb), + le32_to_cpu(desc->j_len), le32_to_cpu(desc->j_trans_id)) ; + return 1 ; + } else { + return 0 ; + } +} + +static void brelse_array(struct buffer_head **heads, int num) { + int i ; + for (i = 0 ; i < num ; i++) { + brelse(heads[i]) ; + } +} + +/* +** given the start, and values for the oldest acceptable transactions, +** this either reads in a replays a transaction, or returns because the transaction +** is invalid, or too old. +*/ +static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cur_dblock, unsigned long oldest_start, + unsigned long oldest_trans_id, unsigned long newest_mount_id) { + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + unsigned long trans_id = 0 ; + struct buffer_head *c_bh ; + struct buffer_head *d_bh ; + struct buffer_head **log_blocks = NULL ; + struct buffer_head **real_blocks = NULL ; + unsigned long trans_offset ; + int i; + + d_bh = bread(p_s_sb->s_dev, cur_dblock, p_s_sb->s_blocksize) ; + if (!d_bh) + return 1 ; + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + trans_offset = d_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: " + "journal_read_transaction, offset %lu, len %d mount_id %d\n", + d_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb), + le32_to_cpu(desc->j_len), le32_to_cpu(desc->j_mount_id)) ; + if (le32_to_cpu(desc->j_trans_id) < oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1039: " + "journal_read_trans skipping because %lu is too old\n", + cur_dblock - reiserfs_get_journal_block(p_s_sb)) ; + brelse(d_bh) ; + return 1 ; + } + if (le32_to_cpu(desc->j_mount_id) != newest_mount_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1146: " + "journal_read_trans skipping because %d is != " + "newest_mount_id %lu\n", le32_to_cpu(desc->j_mount_id), + newest_mount_id) ; + brelse(d_bh) ; + return 1 ; + } + c_bh = bread(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + ((trans_offset + le32_to_cpu(desc->j_len) + 1) % JOURNAL_BLOCK_COUNT), + p_s_sb->s_blocksize) ; + if (!c_bh) { + brelse(d_bh) ; + return 1 ; + } + commit = (struct reiserfs_journal_commit *)c_bh->b_data ; + if (journal_compare_desc_commit(p_s_sb, desc, commit)) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, " + "commit offset %ld had bad time %d or length %d\n", + c_bh->b_blocknr - reiserfs_get_journal_block(p_s_sb), + le32_to_cpu(commit->j_trans_id), le32_to_cpu(commit->j_len)); + brelse(c_bh) ; + brelse(d_bh) ; + return 1; + } + trans_id = le32_to_cpu(desc->j_trans_id) ; + /* now we know we've got a good transaction, and it was inside the valid time ranges */ + 
log_blocks = kmalloc(le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), GFP_BUFFER) ; + real_blocks = kmalloc(le32_to_cpu(desc->j_len) * sizeof(struct buffer_head *), GFP_BUFFER) ; + if (!log_blocks || !real_blocks) { + brelse(c_bh) ; + brelse(d_bh) ; + kfree(log_blocks) ; + kfree(real_blocks) ; + reiserfs_warning("journal-1169: kmalloc failed, unable to mount FS\n") ; + return -1 ; + } + /* get all the buffer heads */ + for(i = 0 ; i < le32_to_cpu(desc->j_len) ; i++) { + log_blocks[i] = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + (trans_offset + 1 + i) % JOURNAL_BLOCK_COUNT, p_s_sb->s_blocksize); + if (i < JOURNAL_TRANS_HALF) { + real_blocks[i] = getblk(p_s_sb->s_dev, le32_to_cpu(desc->j_realblock[i]), p_s_sb->s_blocksize) ; + } else { + real_blocks[i] = getblk(p_s_sb->s_dev, le32_to_cpu(commit->j_realblock[i - JOURNAL_TRANS_HALF]), p_s_sb->s_blocksize) ; + } + if (real_blocks[i]->b_blocknr >= reiserfs_get_journal_block(p_s_sb) && + real_blocks[i]->b_blocknr < (reiserfs_get_journal_block(p_s_sb)+JOURNAL_BLOCK_COUNT)) { + reiserfs_warning("journal-1204: REPLAY FAILURE fsck required! Trying to replay onto a log block\n") ; + brelse_array(log_blocks, i) ; + brelse_array(real_blocks, i) ; + brelse(c_bh) ; + brelse(d_bh) ; + kfree(log_blocks) ; + kfree(real_blocks) ; + return -1 ; + } + } + /* read in the log blocks, memcpy to the corresponding real block */ + ll_rw_block(READ, le32_to_cpu(desc->j_len), log_blocks) ; + for (i = 0 ; i < le32_to_cpu(desc->j_len) ; i++) { + wait_on_buffer(log_blocks[i]) ; + if (!buffer_uptodate(log_blocks[i])) { + reiserfs_warning("journal-1212: REPLAY FAILURE fsck required! buffer write failed\n") ; + brelse_array(log_blocks + i, le32_to_cpu(desc->j_len) - i) ; + brelse_array(real_blocks, le32_to_cpu(desc->j_len)) ; + brelse(c_bh) ; + brelse(d_bh) ; + kfree(log_blocks) ; + kfree(real_blocks) ; + return -1 ; + } + memcpy(real_blocks[i]->b_data, log_blocks[i]->b_data, real_blocks[i]->b_size) ; + mark_buffer_uptodate(real_blocks[i], 1) ; + brelse(log_blocks[i]) ; + } + /* flush out the real blocks */ + for (i = 0 ; i < le32_to_cpu(desc->j_len) ; i++) { + set_bit(BH_Dirty, &(real_blocks[i]->b_state)) ; + ll_rw_block(WRITE, 1, real_blocks + i) ; + } + for (i = 0 ; i < le32_to_cpu(desc->j_len) ; i++) { + wait_on_buffer(real_blocks[i]) ; + if (!buffer_uptodate(real_blocks[i])) { + reiserfs_warning("journal-1226: REPLAY FAILURE, fsck required! buffer write failed\n") ; + brelse_array(real_blocks + i, le32_to_cpu(desc->j_len) - i) ; + brelse(c_bh) ; + brelse(d_bh) ; + kfree(log_blocks) ; + kfree(real_blocks) ; + return -1 ; + } + brelse(real_blocks[i]) ; + } + cur_dblock = reiserfs_get_journal_block(p_s_sb) + ((trans_offset + le32_to_cpu(desc->j_len) + 2) % JOURNAL_BLOCK_COUNT) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1095: setting journal " + "start to offset %ld\n", + cur_dblock - reiserfs_get_journal_block(p_s_sb)) ; + + /* init starting values for the first transaction, in case this is the last transaction to be replayed. */ + SB_JOURNAL(p_s_sb)->j_start = cur_dblock - reiserfs_get_journal_block(p_s_sb) ; + SB_JOURNAL(p_s_sb)->j_last_flush_trans_id = trans_id ; + SB_JOURNAL(p_s_sb)->j_trans_id = trans_id + 1; + brelse(c_bh) ; + brelse(d_bh) ; + kfree(log_blocks) ; + kfree(real_blocks) ; + return 0 ; +} + +/* +** read and replay the log +** on a clean unmount, the journal header's next unflushed pointer will be to an invalid +** transaction. 
This tests that before finding all the transactions in the log, whic makes normal mount times fast. +** +** After a crash, this starts with the next unflushed transaction, and replays until it finds one too old, or invalid. +** +** On exit, it sets things up so the first transaction will work correctly. +*/ +static int journal_read(struct super_block *p_s_sb) { + struct reiserfs_journal_desc *desc ; + unsigned long last_flush_trans_id = 0 ; + unsigned long oldest_trans_id = 0; + unsigned long oldest_invalid_trans_id = 0 ; + time_t start ; + unsigned long last_flush_start = 0; + unsigned long oldest_start = 0; + unsigned long cur_dblock = 0 ; + unsigned long newest_mount_id = 9 ; + struct buffer_head *d_bh ; + struct reiserfs_journal_header *jh ; + int valid_journal_header = 0 ; + int replay_count = 0 ; + int continue_replay = 1 ; + int ret ; + + cur_dblock = reiserfs_get_journal_block(p_s_sb) ; + printk("reiserfs: checking transaction log (device %s) ...\n", + kdevname(p_s_sb->s_dev)) ; + start = CURRENT_TIME ; + + /* step 1, read in the journal header block. Check the transaction it says + ** is the first unflushed, and if that transaction is not valid, + ** replay is done + */ + SB_JOURNAL(p_s_sb)->j_header_bh = bread(p_s_sb->s_dev, + reiserfs_get_journal_block(p_s_sb) + + JOURNAL_BLOCK_COUNT, + p_s_sb->s_blocksize) ; + if (!SB_JOURNAL(p_s_sb)->j_header_bh) { + return 1 ; + } + jh = (struct reiserfs_journal_header *)(SB_JOURNAL(p_s_sb)->j_header_bh->b_data) ; + if (le32_to_cpu(jh->j_first_unflushed_offset) >= 0 && + le32_to_cpu(jh->j_first_unflushed_offset) < JOURNAL_BLOCK_COUNT && + le32_to_cpu(jh->j_last_flush_trans_id) > 0) { + last_flush_start = reiserfs_get_journal_block(p_s_sb) + + le32_to_cpu(jh->j_first_unflushed_offset) ; + last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1153: found in " + "header: first_unflushed_offset %d, last_flushed_trans_id " + "%lu\n", le32_to_cpu(jh->j_first_unflushed_offset), + last_flush_trans_id) ; + valid_journal_header = 1 ; + + /* now, we try to read the first unflushed offset. If it is not valid, + ** there is nothing more we can do, and it makes no sense to read + ** through the whole log. + */ + d_bh = bread(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + le32_to_cpu(jh->j_first_unflushed_offset), p_s_sb->s_blocksize) ; + ret = journal_transaction_is_valid(p_s_sb, d_bh, NULL, NULL) ; + if (!ret) { + continue_replay = 0 ; + } + brelse(d_bh) ; + } + + if (continue_replay && is_read_only(p_s_sb->s_dev)) { + printk("clm-2076: device is readonly, unable to replay log\n") ; + brelse(SB_JOURNAL(p_s_sb)->j_header_bh) ; + SB_JOURNAL(p_s_sb)->j_header_bh = NULL ; + return -1 ; + } + if (continue_replay && (p_s_sb->s_flags & MS_RDONLY)) { + printk("Warning, log replay starting on readonly filesystem\n") ; + } + + /* ok, there are transactions that need to be replayed. start with the first log block, find + ** all the valid transactions, and pick out the oldest. 
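+** a transaction on disk is a description block at offset j_start, followed by
+** j_len log blocks holding copies of the real buffers, followed by a commit
+** block at (j_start + j_len + 1) % JOURNAL_BLOCK_COUNT.  That is why the scan
+** below jumps desc->j_len + 2 blocks past each valid transaction it finds, and
+** only one block past anything invalid.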
+ */ + while(continue_replay && cur_dblock < (reiserfs_get_journal_block(p_s_sb) + JOURNAL_BLOCK_COUNT)) { + d_bh = bread(p_s_sb->s_dev, cur_dblock, p_s_sb->s_blocksize) ; + ret = journal_transaction_is_valid(p_s_sb, d_bh, &oldest_invalid_trans_id, &newest_mount_id) ; + if (ret == 1) { + desc = (struct reiserfs_journal_desc *)d_bh->b_data ; + if (oldest_start == 0) { /* init all oldest_ values */ + oldest_trans_id = le32_to_cpu(desc->j_trans_id) ; + oldest_start = d_bh->b_blocknr ; + newest_mount_id = le32_to_cpu(desc->j_mount_id) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " + "oldest_start to offset %lu, trans_id %lu\n", + oldest_start - reiserfs_get_journal_block(p_s_sb), + oldest_trans_id) ; + } else if (oldest_trans_id > le32_to_cpu(desc->j_trans_id)) { + /* one we just read was older */ + oldest_trans_id = le32_to_cpu(desc->j_trans_id) ; + oldest_start = d_bh->b_blocknr ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1180: Resetting " + "oldest_start to offset %lu, trans_id %lu\n", + oldest_start - reiserfs_get_journal_block(p_s_sb), + oldest_trans_id) ; + } + if (newest_mount_id < le32_to_cpu(desc->j_mount_id)) { + newest_mount_id = le32_to_cpu(desc->j_mount_id) ; + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %d\n", le32_to_cpu(desc->j_mount_id)); + } + cur_dblock += le32_to_cpu(desc->j_len) + 2 ; + } + else { + cur_dblock++ ; + } + brelse(d_bh) ; + } + /* step three, starting at the oldest transaction, replay */ + if (last_flush_start > 0) { + oldest_start = last_flush_start ; + oldest_trans_id = last_flush_trans_id ; + } + cur_dblock = oldest_start ; + if (oldest_trans_id) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " + "from offset %lu, trans_id %lu\n", + cur_dblock - reiserfs_get_journal_block(p_s_sb), + oldest_trans_id) ; + + } + replay_count = 0 ; + while(continue_replay && oldest_trans_id > 0) { + ret = journal_read_transaction(p_s_sb, cur_dblock, oldest_start, oldest_trans_id, newest_mount_id) ; + if (ret < 0) { + return ret ; + } else if (ret != 0) { + break ; + } + cur_dblock = reiserfs_get_journal_block(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start ; + replay_count++ ; + } + + if (oldest_trans_id == 0) { + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1225: No valid " + "transactions found\n") ; + } + /* j_start does not get set correctly if we don't replay any transactions. 
+ ** if we had a valid journal_header, set j_start to the first unflushed transaction value, + ** copy the trans_id from the header + */ + if (valid_journal_header && replay_count == 0) { + SB_JOURNAL(p_s_sb)->j_start = le32_to_cpu(jh->j_first_unflushed_offset) ; + SB_JOURNAL(p_s_sb)->j_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) + 1; + SB_JOURNAL(p_s_sb)->j_last_flush_trans_id = le32_to_cpu(jh->j_last_flush_trans_id) ; + SB_JOURNAL(p_s_sb)->j_mount_id = le32_to_cpu(jh->j_mount_id) + 1; + } else { + SB_JOURNAL(p_s_sb)->j_mount_id = newest_mount_id + 1 ; + } + reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1299: Setting " + "newest_mount_id to %lu\n", SB_JOURNAL(p_s_sb)->j_mount_id) ; + SB_JOURNAL(p_s_sb)->j_first_unflushed_offset = SB_JOURNAL(p_s_sb)->j_start ; + if (replay_count > 0) { + printk("reiserfs: replayed %d transactions in %lu seconds\n", replay_count, + CURRENT_TIME - start) ; + } + if (!is_read_only(p_s_sb->s_dev)) { + update_journal_header_block(p_s_sb, SB_JOURNAL(p_s_sb)->j_start, + SB_JOURNAL(p_s_sb)->j_last_flush_trans_id) ; + } + return 0 ; +} + + +struct reiserfs_journal_commit_task { + struct super_block *p_s_sb ; + int jindex ; + int wake_on_finish ; /* if this is one, we wake the task_done queue, if it + ** is zero, we free the whole struct on finish + */ + struct reiserfs_journal_commit_task *self ; + struct wait_queue *task_done ; + struct tq_struct task ; +} ; + +static void reiserfs_journal_commit_task_func(struct reiserfs_journal_commit_task *ct) { + + struct reiserfs_journal_list *jl ; + jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ; + + flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; + if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 && + atomic_read(&(jl->j_commit_left)) == 0) { + kupdate_one_transaction(ct->p_s_sb, jl) ; + } + kfree(ct->self) ; +} + +static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct, + struct super_block *p_s_sb, + int jindex) { + if (!ct) { + reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ; + } + ct->p_s_sb = p_s_sb ; + ct->jindex = jindex ; + ct->task_done = NULL ; + INIT_LIST_HEAD(&ct->task.list) ; + ct->task.sync = 0 ; + ct->task.routine = (void *)(void *)reiserfs_journal_commit_task_func ; + ct->self = ct ; + ct->task.data = (void *)ct ; +} + +static void commit_flush_async(struct super_block *p_s_sb, int jindex) { + struct reiserfs_journal_commit_task *ct ; + /* using GFP_BUFFER, GFP_KERNEL could try to flush inodes, which will try + ** to start/join a transaction, which will deadlock + */ + ct = kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_BUFFER) ; + if (ct) { + setup_commit_task_arg(ct, p_s_sb, jindex) ; + queue_task(&(ct->task), &reiserfs_commit_thread_tq); + wake_up(&reiserfs_commit_thread_wait) ; + } else { +#ifdef CONFIG_REISERFS_CHECK + reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ; +#endif + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; + } +} + +/* +** this is the commit thread. It is started with kernel_thread on +** FS mount, and journal_release() waits for it to exit. +** +** It could do a periodic commit, but there is a lot code for that +** elsewhere right now, and I only wanted to implement this little +** piece for starters. +** +** All we do here is sleep on the j_commit_thread_wait wait queue, and +** then run the per filesystem commit task queue when we wakeup. 
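+**
+** the thread stays alive for as long as reiserfs_mounted_fs_count is positive.
+** do_journal_release decrements that count, wakes reiserfs_commit_thread_wait,
+** and then sleeps on reiserfs_commit_thread_done until the loop below sees the
+** count go to zero or below and exits.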
+*/
+static int reiserfs_journal_commit_thread(void *nullp) {
+ exit_files(current);
+ exit_mm(current);
+
+ spin_lock_irq(&current->sigmask_lock);
+ sigfillset(&current->blocked);
+ recalc_sigpending(current);
+ spin_unlock_irq(&current->sigmask_lock);
+
+ current->session = 1;
+ current->pgrp = 1;
+ sprintf(current->comm, "kreiserfsd") ;
+ lock_kernel() ;
+ while(1) {
+
+ while(TQ_ACTIVE(reiserfs_commit_thread_tq)) {
+ run_task_queue(&reiserfs_commit_thread_tq) ;
+ }
+
+ /* if there aren't any more filesystems left, break */
+ if (reiserfs_mounted_fs_count <= 0) {
+ run_task_queue(&reiserfs_commit_thread_tq) ;
+ break ;
+ }
+ wake_up(&reiserfs_commit_thread_done) ;
+ interruptible_sleep_on_timeout(&reiserfs_commit_thread_wait, 5) ;
+ }
+ unlock_kernel() ;
+ wake_up(&reiserfs_commit_thread_done) ;
+ return 0 ;
+}
+
+static void journal_list_init(struct super_block *p_s_sb) {
+ int i ;
+ for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
+ init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
+ init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
+ }
+}
+
+/*
+** must be called once on fs mount. calls journal_read for you
+*/
+int journal_init(struct super_block *p_s_sb) {
+ int num_cnodes = JOURNAL_BLOCK_COUNT * 2 ;
+
+ if (sizeof(struct reiserfs_journal_commit) != 4096 ||
+ sizeof(struct reiserfs_journal_desc) != 4096
+ ) {
+ printk("journal-1249: commit or desc struct not 4096 %Zd %Zd\n", sizeof(struct reiserfs_journal_commit),
+ sizeof(struct reiserfs_journal_desc)) ;
+ return 1 ;
+ }
+ /* sanity check to make sure they don't overflow the journal */
+ if (JOURNAL_BLOCK_COUNT > reiserfs_get_journal_orig_size(p_s_sb)) {
+ printk("journal-1393: current JOURNAL_BLOCK_COUNT (%d) is too big. This FS was created with a journal size of %lu blocks\n",
+ JOURNAL_BLOCK_COUNT, reiserfs_get_journal_orig_size(p_s_sb)) ;
+ return 1 ;
+ }
+ SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
+
+ if (!SB_JOURNAL(p_s_sb)) {
+ printk("journal-1256: unable to get memory for journal structure\n") ;
+ return 1 ;
+ }
+ memset(SB_JOURNAL(p_s_sb), 0, sizeof(struct reiserfs_journal)) ;
+
+ SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
+ SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
+
+ /* clear out the journal list array */
+ memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ;
+ journal_list_init(p_s_sb) ;
+
+ memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
+ memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
+
+ INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
+ reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap,
+ SB_BMAP_NR(p_s_sb)) ;
+ allocate_bitmap_nodes(p_s_sb) ;
+
+ SB_JOURNAL(p_s_sb)->j_start = 0 ;
+ SB_JOURNAL(p_s_sb)->j_len = 0 ;
+ SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ;
+ SB_JOURNAL(p_s_sb)->j_bcount = 0 ;
+ SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
+ SB_JOURNAL(p_s_sb)->j_last = NULL ;
+ SB_JOURNAL(p_s_sb)->j_first = NULL ;
+ init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+ init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
+
+ SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;
+ SB_JOURNAL(p_s_sb)->j_mount_id = 10 ;
+ SB_JOURNAL(p_s_sb)->j_state = 0 ;
+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+ atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
+ SB_JOURNAL(p_s_sb)->j_cnode_free_list =
allocate_cnodes(num_cnodes) ; + SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; + SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? num_cnodes : 0 ; + SB_JOURNAL(p_s_sb)->j_cnode_used = 0 ; + SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; + init_journal_hash(p_s_sb) ; + SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ; + if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) { + reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ; + return 1 ; + } + if (journal_read(p_s_sb) < 0) { + reiserfs_warning("Replay Failure, unable to mount\n") ; + free_journal_ram(p_s_sb) ; + return 1 ; + } + SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this where it belongs */ + + if (reiserfs_dont_log (p_s_sb)) + return 0; + + reiserfs_mounted_fs_count++ ; + if (reiserfs_mounted_fs_count <= 1) { + kernel_thread((void *)(void *)reiserfs_journal_commit_thread, NULL, + CLONE_FS | CLONE_FILES | CLONE_VM) ; + } + return 0 ; +} + +/* +** test for a polite end of the current transaction. Used by file_write, and should +** be used by delete to make sure they don't write more than can fit inside a single +** transaction +*/ +int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { + time_t now = CURRENT_TIME ; + if (reiserfs_dont_log(th->t_super)) + return 0 ; + if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 || + (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= JOURNAL_MAX_BATCH || + atomic_read(&(SB_JOURNAL(th->t_super)->j_jlock)) || + (now - SB_JOURNAL(th->t_super)->j_trans_start_time) > JOURNAL_MAX_TRANS_AGE || + SB_JOURNAL(th->t_super)->j_cnode_free < (JOURNAL_TRANS_MAX * 3)) { + return 1 ; + } + return 0 ; +} + +/* this must be called inside a transaction, and requires the +** kernel_lock to be held +*/ +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { + struct super_block *s = th->t_super ; + SB_JOURNAL(s)->j_must_wait = 1 ; + set_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state) ; + return ; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_allow_writes(struct super_block *s) { + clear_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state) ; + wake_up(&SB_JOURNAL(s)->j_join_wait) ; +} + +/* this must be called without a transaction started, and does not +** require BKL +*/ +void reiserfs_wait_on_write_block(struct super_block *s) { + wait_event(SB_JOURNAL(s)->j_join_wait, + !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ; +} + +/* join == true if you must join an existing transaction. +** join == false if you can deal with waiting for others to finish +** +** this will block until the transaction is joinable. send the number of blocks you +** expect to use in nblocks. 
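+**
+** a typical caller (just a sketch, using only the entry points defined in this
+** file) looks like:
+**
+**   struct reiserfs_transaction_handle th ;
+**   journal_begin(&th, p_s_sb, nblocks) ;
+**   reiserfs_prepare_for_journal(p_s_sb, bh, 1) ;
+**   ... modify bh ...
+**   journal_mark_dirty(&th, p_s_sb, bh) ;
+**   journal_end(&th, p_s_sb, nblocks) ;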
+*/ +static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { + time_t now = CURRENT_TIME ; + int old_trans_id ; + + reiserfs_check_lock_depth("journal_begin") ; +#ifdef CONFIG_REISERFS_CHECK + if (p_s_sb->s_flags & MS_RDONLY) { + printk("clm-2078: calling journal_begin on readonly FS\n") ; + BUG() ; + } +#endif + + if (reiserfs_dont_log(p_s_sb)) { + th->t_super = p_s_sb ; /* others will check this for the don't log flag */ + return 0 ; + } + +relock: + lock_journal(p_s_sb) ; + + if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) { + unlock_journal(p_s_sb) ; + reiserfs_wait_on_write_block(p_s_sb) ; + goto relock ; + } + + /* if there is no room in the journal OR + ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning + ** we don't sleep if there aren't other writers + */ + + + if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) || + ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= JOURNAL_MAX_BATCH) || + (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > JOURNAL_MAX_TRANS_AGE) || + (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) || + (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (JOURNAL_TRANS_MAX * 3))) { + + unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ + + /* if writer count is 0, we can just force this transaction to end, and start + ** a new one afterwards. + */ + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { + struct reiserfs_transaction_handle myth ; + journal_join(&myth, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ; + } else { + /* but if the writer count isn't zero, we have to wait for the current writers to finish. + ** They won't batch on transaction end once we set j_jlock + */ + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; + old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && + SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) { + sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + } + } + lock_journal(p_s_sb) ; /* relock to continue */ + } + + if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */ + SB_JOURNAL(p_s_sb)->j_trans_start_time = now ; + } + atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; + SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ; + th->t_blocks_logged = 0 ; + th->t_blocks_allocated = nblocks ; + th->t_super = p_s_sb ; + th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + th->t_caller = "Unknown" ; + unlock_journal(p_s_sb) ; + p_s_sb->s_dirt = 1; + return 0 ; +} + + +int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + return do_journal_begin_r(th, p_s_sb, nblocks, 1) ; +} + +int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; +} + +/* not used at all */ +int journal_prepare(struct super_block * p_s_sb, struct buffer_head *bh) { + return 0 ; +} + +/* +** puts bh into the current transaction. If it was already there, reorders removes the +** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order). 
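+** (the hash in question is SB_JOURNAL(p_s_sb)->j_hash_table, keyed on device
+** and block number, which is how later operations on the same block find the
+** cnode added here.)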
+** +** if it was dirty, cleans and files onto the clean list. I can't let it be dirty again until the +** transaction is committed. +** +** if j_len, is bigger than j_len_alloc, it pushes j_len_alloc to 10 + j_len. +*/ +int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { + struct reiserfs_journal_cnode *cn = NULL; + int count_already_incd = 0 ; + int prepared = 0 ; + + if (reiserfs_dont_log(th->t_super)) { + mark_buffer_dirty(bh) ; + return 0 ; + } + + if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577: handle trans id %d != current trans id %d\n", + th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id); + } + p_s_sb->s_dirt = 1 ; + + prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ; + /* already in this transaction, we are done */ + if (buffer_journaled(bh)) { + return 0 ; + } + + /* this must be turned into a panic instead of a warning. We can't allow + ** a dirty or journal_dirty or locked buffer to be logged, as some changes + ** could get to disk too early. NOT GOOD. + */ + if (!prepared || buffer_locked(bh)) { + printk("journal-1777: buffer %lu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n", bh->b_blocknr, prepared ? ' ' : '!', + buffer_locked(bh) ? ' ' : '!', + buffer_dirty(bh) ? ' ' : '!', + buffer_journal_dirty(bh) ? ' ' : '!') ; + show_reiserfs_locks() ; + } + count_already_incd = clear_prepared_bits(bh) ; + + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { + printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ; + return 1 ; + } + /* this error means I've screwed up, and we've overflowed the transaction. + ** Nothing can be done here, except make the FS readonly or panic. + */ + if (SB_JOURNAL(p_s_sb)->j_len >= JOURNAL_TRANS_MAX) { + reiserfs_panic(th->t_super, "journal-1413: journal_mark_dirty: j_len (%lu) is too big\n", SB_JOURNAL(p_s_sb)->j_len) ; + } + + if (buffer_journal_dirty(bh)) { + count_already_incd = 1 ; + mark_buffer_notjournal_dirty(bh) ; + } + + if (buffer_dirty(bh)) { + clear_bit(BH_Dirty, &bh->b_state) ; + } + + if (buffer_journaled(bh)) { /* must double check after getting lock */ + goto done ; + } + + if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) { + SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ; + } + + set_bit(BH_JDirty, &bh->b_state) ; + + /* now put this guy on the end */ + if (!cn) { + cn = get_cnode(p_s_sb) ; + if (!cn) { + reiserfs_panic(p_s_sb, "get_cnode failed!\n"); + } + + if (th->t_blocks_logged == th->t_blocks_allocated) { + th->t_blocks_allocated += JOURNAL_PER_BALANCE_CNT ; + SB_JOURNAL(p_s_sb)->j_len_alloc += JOURNAL_PER_BALANCE_CNT ; + } + th->t_blocks_logged++ ; + SB_JOURNAL(p_s_sb)->j_len++ ; + + cn->bh = bh ; + cn->blocknr = bh->b_blocknr ; + cn->dev = bh->b_dev ; + cn->jlist = NULL ; + insert_journal_hash(SB_JOURNAL(p_s_sb)->j_hash_table, cn) ; + if (!count_already_incd) { + atomic_inc(&(bh->b_count)) ; + } + } + cn->next = NULL ; + cn->prev = SB_JOURNAL(p_s_sb)->j_last ; + cn->bh = bh ; + if (SB_JOURNAL(p_s_sb)->j_last) { + SB_JOURNAL(p_s_sb)->j_last->next = cn ; + SB_JOURNAL(p_s_sb)->j_last = cn ; + } else { + SB_JOURNAL(p_s_sb)->j_first = cn ; + SB_JOURNAL(p_s_sb)->j_last = cn ; + } +done: + return 0 ; +} + +/* +** if buffer already in current transaction, do a journal_mark_dirty +** otherwise, just mark it dirty and move on. 
Used for writes to meta blocks +** that don't need journaling +*/ +int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { + if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || + buffer_journal_dirty(bh)) { + return journal_mark_dirty(th, p_s_sb, bh) ; + } + if (get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_dev,bh->b_blocknr,bh->b_size)) { + return journal_mark_dirty(th, p_s_sb, bh) ; + } + mark_buffer_dirty(bh) ; + return 0 ; +} + +int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + return do_journal_end(th, p_s_sb, nblocks, 0) ; +} + +/* removes from the current transaction, relsing and descrementing any counters. +** also files the removed buffer directly onto the clean list +** +** called by journal_mark_freed when a block has been deleted +** +** returns 1 if it cleaned and relsed the buffer. 0 otherwise +*/ +int remove_from_transaction(struct super_block *p_s_sb, unsigned long blocknr, int already_cleaned) { + struct buffer_head *bh ; + struct reiserfs_journal_cnode *cn ; + int ret = 0; + + cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_hash_table, p_s_sb->s_dev, blocknr, p_s_sb->s_blocksize) ; + if (!cn || !cn->bh) { + return ret ; + } + bh = cn->bh ; + if (cn->prev) { + cn->prev->next = cn->next ; + } + if (cn->next) { + cn->next->prev = cn->prev ; + } + if (cn == SB_JOURNAL(p_s_sb)->j_first) { + SB_JOURNAL(p_s_sb)->j_first = cn->next ; + } + if (cn == SB_JOURNAL(p_s_sb)->j_last) { + SB_JOURNAL(p_s_sb)->j_last = cn->prev ; + } + remove_journal_hash(SB_JOURNAL(p_s_sb)->j_hash_table, NULL, bh, 0) ; + mark_buffer_not_journaled(bh) ; /* don't log this one */ + + if (!already_cleaned) { + mark_buffer_notjournal_dirty(bh) ; + atomic_dec(&(bh->b_count)) ; + if (atomic_read(&(bh->b_count)) < 0) { + printk("journal-1752: remove from trans, b_count < 0\n") ; + } + if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; + ret = 1 ; + } + SB_JOURNAL(p_s_sb)->j_len-- ; + SB_JOURNAL(p_s_sb)->j_len_alloc-- ; + free_cnode(p_s_sb, cn) ; + return ret ; +} + +/* removes from a specific journal list hash */ +int remove_from_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, struct buffer_head *bh, int remove_freed) { + remove_journal_hash(SB_JOURNAL(s)->j_list_hash_table, jl, bh, remove_freed) ; + return 0 ; +} + +/* +** for any cnode in a journal list, it can only be dirtied of all the +** transactions that include it are commited to disk. +** this checks through each transaction, and returns 1 if you are allowed to dirty, +** and 0 if you aren't +** +** it is called by dirty_journal_list, which is called after flush_commit_list has gotten all the log +** blocks for a given transaction on disk +** +*/ +static int can_dirty(struct reiserfs_journal_cnode *cn) { + kdev_t dev = cn->dev ; + unsigned long blocknr = cn->blocknr ; + struct reiserfs_journal_cnode *cur = cn->hprev ; + int can_dirty = 1 ; + + /* first test hprev. These are all newer than cn, so any node here + ** with the name block number and dev means this node can't be sent + ** to disk right now. + */ + while(cur && can_dirty) { + if (cur->jlist && cur->bh && cur->blocknr && cur->dev == dev && + cur->blocknr == blocknr) { + can_dirty = 0 ; + } + cur = cur->hprev ; + } + /* then test hnext. These are all older than cn. 
As long as they + ** are committed to the log, it is safe to write cn to disk + */ + cur = cn->hnext ; + while(cur && can_dirty) { + if (cur->jlist && cur->jlist->j_len > 0 && + atomic_read(&(cur->jlist->j_commit_left)) > 0 && cur->bh && + cur->blocknr && cur->dev == dev && cur->blocknr == blocknr) { + can_dirty = 0 ; + } + cur = cur->hnext ; + } + return can_dirty ; +} + +/* syncs the commit blocks, but does not force the real buffers to disk +** will wait until the current transaction is done/commited before returning +*/ +int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; +} + +#ifdef __KERNEL__ +int show_reiserfs_locks(void) { + + dump_journal_writers() ; +#if 0 /* debugging code for when we are compiled static don't delete */ + p_s_sb = sb_entry(super_blocks.next); + while (p_s_sb != sb_entry(&super_blocks)) { + if (reiserfs_is_super(p_s_sb)) { +printk("journal lock is %d, join lock is %d, writers %d must wait is %d\n", + atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)), + atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)), + atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)), + SB_JOURNAL(p_s_sb)->j_must_wait) ; + printk("used cnodes %d, free cnodes %d\n", SB_JOURNAL(p_s_sb)->j_cnode_used, SB_JOURNAL(p_s_sb)->j_cnode_free) ; + } + p_s_sb = sb_entry(p_s_sb->s_list.next); + } +#endif + return 0 ; +} +#endif + +/* +** used to get memory back from async commits that are floating around +** and to reclaim any blocks deleted but unusable because their commits +** haven't hit disk yet. called from bitmap.c +** +** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat. +** note, this is just if schedule has a chance of occuring. I need to +** change flush_commit_lists to have a repeat parameter too. +** +*/ +void flush_async_commits(struct super_block *p_s_sb) { + int i ; + + for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { + if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; + } + } +} + +/* +** flushes any old transactions to disk +** ends the current transaction if it is too old +** +** also calls flush_journal_list with old_only == 1, which allows me to reclaim +** memory and such from the journal lists whose real blocks are all on disk. +** +** called by sync_dev_journal from buffer.c +*/ +int flush_old_commits(struct super_block *p_s_sb, int immediate) { + int i ; + int count = 0; + int start ; + time_t now ; + int keep_dirty = 0 ; + struct reiserfs_transaction_handle th ; + + start = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + now = CURRENT_TIME ; + + /* safety check so we don't flush while we are replaying the log during mount */ + if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) { + return 0 ; + } + if (!strcmp(current->comm, "kupdate")) { + immediate = 0 ; + keep_dirty = 1 ; + } + /* starting with oldest, loop until we get to the start */ + i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; + while(i != start) { + if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > JOURNAL_MAX_COMMIT_AGE || + immediate)) { + /* we have to check again to be sure the current transaction did not change */ + if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; + } + } + i = (i + 1) % JOURNAL_LIST_COUNT ; + count++ ; + } + /* now, check the current transaction. 
If there are no writers, and it is too old, finish it, and + ** force the commit blocks to disk + */ + if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && + SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && + SB_JOURNAL(p_s_sb)->j_len > 0 && + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > JOURNAL_MAX_TRANS_AGE) { + journal_join(&th, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ; + keep_dirty = 0 ; + } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to + flush, we must be sure old transactions hit the disk too. */ + journal_join(&th, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; + } + keep_dirty |= reiserfs_journal_kupdate(p_s_sb) ; + return keep_dirty ; +} + +/* +** returns 0 if do_journal_end should return right away, returns 1 if do_journal_end should finish the commit +** +** if the current transaction is too old, but still has writers, this will wait on j_join_wait until all +** the writers are done. By the time it wakes up, the transaction it was called has already ended, so it just +** flushes the commit list and returns 0. +** +** Won't batch when flush or commit_now is set. Also won't batch when others are waiting on j_join_wait. +** +** Note, we can't allow the journal_end to proceed while there are still writers in the log. +*/ +static int check_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, + unsigned long nblocks, int flags) { + + time_t now ; + int flush = flags & FLUSH_ALL ; + int commit_now = flags & COMMIT_NOW ; + int wait_on_commit = flags & WAIT ; + + if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { + reiserfs_panic(th->t_super, "journal-1577: handle trans id %d != current trans id %d\n", + th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id); + } + + SB_JOURNAL(p_s_sb)->j_len_alloc -= (th->t_blocks_allocated - th->t_blocks_logged) ; + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) { /* <= 0 is allowed. unmounting might not call begin */ + atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; + } + + /* BUG, deal with case where j_len is 0, but people previously freed blocks need to be released + ** will be dealt with by next transaction that actually writes something, but should be taken + ** care of in this trans + */ + if (SB_JOURNAL(p_s_sb)->j_len == 0) { + int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; + unlock_journal(p_s_sb) ; + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) { + atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ; + wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + } + return 0 ; + } + /* if wcount > 0, and we are called to with flush or commit_now, + ** we wait on j_join_wait. We will wake up when the last writer has + ** finished the transaction, and started it on its way to the disk. + ** Then, we flush the commit or journal list, and just return 0 + ** because the rest of journal end was already done for this transaction. 
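+ ** If neither flush nor commit_now was asked for, we just unlock and return 0,
+ ** leaving the blocks in the still running transaction; that is the batching
+ ** this function implements.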
+ */ + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) { + if (flush || commit_now) { + int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; + if (flush) { + SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ; + } + unlock_journal(p_s_sb) ; + /* sleep while the current transaction is still j_jlocked */ + while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && + SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) { + sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + } + if (commit_now) { + if (wait_on_commit) { + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; + } else { + commit_flush_async(p_s_sb, orig_jindex) ; + } + } + return 0 ; + } + unlock_journal(p_s_sb) ; + return 0 ; + } + + /* deal with old transactions where we are the last writers */ + now = CURRENT_TIME ; + if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > JOURNAL_MAX_TRANS_AGE) { + commit_now = 1 ; + SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ; + } + /* don't batch when someone is waiting on j_join_wait */ + /* don't batch when syncing the commit or flushing the whole trans */ + if (!(SB_JOURNAL(p_s_sb)->j_must_wait > 0) && !(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))) && !flush && !commit_now && + (SB_JOURNAL(p_s_sb)->j_len < JOURNAL_MAX_BATCH) && + SB_JOURNAL(p_s_sb)->j_len_alloc < JOURNAL_MAX_BATCH && SB_JOURNAL(p_s_sb)->j_cnode_free > (JOURNAL_TRANS_MAX * 3)) { + SB_JOURNAL(p_s_sb)->j_bcount++ ; + unlock_journal(p_s_sb) ; + return 0 ; + } + + if (SB_JOURNAL(p_s_sb)->j_start > JOURNAL_BLOCK_COUNT) { + reiserfs_panic(p_s_sb, "journal-003: journal_end: j_start (%d) is too high\n", SB_JOURNAL(p_s_sb)->j_start) ; + } + return 1 ; +} + +/* +** Does all the work that makes deleting blocks safe. +** when deleting a block mark BH_JNew, just remove it from the current transaction, clean it's buffer_head and move on. +** +** otherwise: +** set a bit for the block in the journal bitmap. That will prevent it from being allocated for unformatted nodes +** before this transaction has finished. +** +** mark any cnodes for this block as BLOCK_FREED, and clear their bh pointers. That will prevent any old transactions with +** this block from trying to flush to the real location. Since we aren't removing the cnode from the journal_list_hash, +** the block can't be reallocated yet. +** +** Then remove it from the current transaction, decrementing any counters and filing it on the clean list. 
+*/ +int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long blocknr) { + struct reiserfs_journal_cnode *cn = NULL ; + struct buffer_head *bh = NULL ; + struct reiserfs_list_bitmap *jb = NULL ; + int cleaned = 0 ; + + if (reiserfs_dont_log(th->t_super)) { + bh = get_hash_table(p_s_sb->s_dev, blocknr, p_s_sb->s_blocksize) ; + if (bh && buffer_dirty (bh)) { + printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr); + BUG (); + } + brelse (bh); + return 0 ; + } + bh = get_hash_table(p_s_sb->s_dev, blocknr, p_s_sb->s_blocksize) ; + /* if it is journal new, we just remove it from this transaction */ + if (bh && buffer_journal_new(bh)) { + clear_prepared_bits(bh) ; + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; + } else { + /* set the bit for this block in the journal bitmap for this transaction */ + jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ; + if (!jb) { + reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; + } + set_bit_in_list_bitmap(p_s_sb, blocknr, jb) ; + + /* Note, the entire while loop is not allowed to schedule. */ + + if (bh) { + clear_prepared_bits(bh) ; + } + cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; + + /* find all older transactions with this block, make sure they don't try to write it out */ + cn = get_journal_hash_dev(SB_JOURNAL(p_s_sb)->j_list_hash_table, p_s_sb->s_dev, blocknr, p_s_sb->s_blocksize) ; + while (cn) { + if (p_s_sb->s_dev == cn->dev && blocknr == cn->blocknr) { + set_bit(BLOCK_FREED, &cn->state) ; + if (cn->bh) { + if (!cleaned) { + /* remove_from_transaction will brelse the buffer if it was + ** in the current trans + */ + mark_buffer_notjournal_dirty(cn->bh) ; + if (!buffer_locked(cn->bh)) { + reiserfs_clean_and_file_buffer(cn->bh) ; + } + cleaned = 1 ; + atomic_dec(&(cn->bh->b_count)) ; + if (atomic_read(&(cn->bh->b_count)) < 0) { + printk("journal-2138: cn->bh->b_count < 0\n") ; + } + } + if (cn->jlist) { /* since we are clearing the bh, we MUST dec nonzerolen */ + atomic_dec(&(cn->jlist->j_nonzerolen)) ; + } + cn->bh = NULL ; + } + } + cn = cn->hnext ; + } + } + + if (bh) { + atomic_dec(&(bh->b_count)) ; /* get_hash incs this */ + if (atomic_read(&(bh->b_count)) < 0) { + printk("journal-2165: bh->b_count < 0\n") ; + } + } + return 0 ; +} + +void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, + struct buffer_head *bh) { + if (reiserfs_dont_log (p_s_sb)) + return; + + if (!bh) { + return ; + } + clear_bit(BH_JPrepared, &bh->b_state) ; +} + +extern struct tree_balance *cur_tb ; +/* +** before we can change a metadata block, we have to make sure it won't +** be written to disk while we are altering it. So, we must: +** clean it +** wait on it. +** +*/ +void reiserfs_prepare_for_journal(struct super_block *p_s_sb, + struct buffer_head *bh, int wait) { + int retry_count = 0 ; + + if (reiserfs_dont_log (p_s_sb)) + return; + + while(!test_bit(BH_JPrepared, &bh->b_state) || + (wait && buffer_locked(bh))) { + if (buffer_journaled(bh)) { + set_bit(BH_JPrepared, &bh->b_state) ; + return ; + } + set_bit(BH_JPrepared, &bh->b_state) ; + if (wait) { +#ifdef CONFIG_REISERFS_CHECK + if (buffer_locked(bh) && cur_tb != NULL) { + printk("reiserfs_prepare_for_journal, waiting while do_balance was running\n") ; + BUG() ; + } +#endif + wait_on_buffer(bh) ; + } + retry_count++ ; + } +} + +/* + * Wait for a page to get unlocked. 
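+ * (it also gives up as soon as pl->do_not_lock is set, so we never wait on a
+ * page that the calling function itself is holding locked.)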
+ * + * This must be called with the caller "holding" the page, + * ie with increased "page->count" so that the page won't + * go away during the wait.. + */ +static void ___reiserfs_wait_on_page(struct reiserfs_page_list *pl) +{ + struct task_struct *tsk = current; + struct page *page = pl->page ; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(&page->wait, &wait); + do { + block_sync_page(page); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!PageLocked(page) || pl->do_not_lock) + break; + schedule(); + } while (PageLocked(page)); + tsk->state = TASK_RUNNING; + remove_wait_queue(&page->wait, &wait); +} + +/* + * Get an exclusive lock on the page.. + * but, every time you get woken up, check the page to make sure + * someone hasn't called a journal_begin with it locked. + * + * the page should always be locked when this returns + * + * returns 0 if you've got the page locked + * returns 1 if it returns because someone else has called journal_begin + * with the page locked + * this is only useful to the code that flushes pages before a + * commit. Do not export this hack. Ever. + */ +static int reiserfs_try_lock_page(struct reiserfs_page_list *pl) +{ + struct page *page = pl->page ; + while (TryLockPage(page)) { + if (pl->do_not_lock) { + /* the page is locked, but we cannot have it */ + return 1 ; + } + ___reiserfs_wait_on_page(pl); + } + /* we have the page locked */ + return 0 ; +} + + +/* +** This can only be called from do_journal_end. +** it runs through the list things that need flushing before the +** transaction can commit, and writes each of them to disk +** +*/ + +static void flush_pages_before_commit(struct reiserfs_transaction_handle *th, + struct super_block *p_s_sb) { + struct reiserfs_page_list *pl = SB_JOURNAL(p_s_sb)->j_flush_pages ; + struct reiserfs_page_list *pl_tmp ; + struct buffer_head *bh, *head ; + int count = 0 ; + + /* first write each dirty unlocked buffer in the list */ + + while(pl) { + /* ugly. journal_end can be called from get_block, which has a + ** page locked. So, we have to check to see if pl->page is the page + ** currently locked by the calling function, and if so, skip the + ** lock + */ + if (reiserfs_try_lock_page(pl)) { + goto setup_next ; + } + if (!PageLocked(pl->page)) { + BUG() ; + } + if (pl->page->buffers) { + head = pl->page->buffers ; + bh = head ; + do { + if (bh->b_blocknr == pl->blocknr && buffer_dirty(bh) && + !buffer_locked(bh) && buffer_uptodate(bh) ) { + ll_rw_block(WRITE, 1, &bh) ; + } + bh = bh->b_this_page ; + } while (bh != head) ; + } + if (!pl->do_not_lock) { + UnlockPage(pl->page) ; + } +setup_next: + pl = pl->next ; + } + + /* now wait on them */ + + pl = SB_JOURNAL(p_s_sb)->j_flush_pages ; + while(pl) { + if (reiserfs_try_lock_page(pl)) { + goto remove_page ; + } + if (!PageLocked(pl->page)) { + BUG() ; + } + if (pl->page->buffers) { + head = pl->page->buffers ; + bh = head ; + do { + if (bh->b_blocknr == pl->blocknr) { + count++ ; + wait_on_buffer(bh) ; + if (!buffer_uptodate(bh)) { + reiserfs_panic(p_s_sb, "journal-2443: flush_pages_before_commit, error writing block %lu\n", bh->b_blocknr) ; + } + } + bh = bh->b_this_page ; + } while (bh != head) ; + } + if (!pl->do_not_lock) { + UnlockPage(pl->page) ; + } +remove_page: + /* we've waited on the I/O, we can remove the page from the + ** list, and free our pointer struct to it. 
+ */ + if (pl->prev) { + pl->prev->next = pl->next ; + } + if (pl->next) { + pl->next->prev = pl->prev ; + } + put_page(pl->page) ; + pl_tmp = pl ; + pl = pl->next ; + reiserfs_kfree(pl_tmp, sizeof(struct reiserfs_page_list), p_s_sb) ; + } + SB_JOURNAL(p_s_sb)->j_flush_pages = NULL ; +} + +/* +** called when a indirect item is converted back into a tail. +** +** The reiserfs part of the inode stores enough information to find +** our page_list struct in the flush list. We remove it from the list +** and free the struct. +** +** Note, it is possible for this to happen: +** +** reiserfs_add_page_to_flush_list(inode) +** transaction ends, list is flushed +** reiserfs_remove_page_from_flush_list(inode) +** +** This would be bad because the page_list pointer in the inode is not +** updated when the list is flushed, so we can't know if the pointer is +** valid. So, in the inode, we also store the transaction id when the +** page was added. If we are trying to remove something from an old +** transaction, we just clear out the pointer in the inode and return. +** +** Normal case is to use the reiserfs_page_list pointer in the inode to +** find and remove the page from the flush list. +*/ +int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *th, + struct inode *inode) { + struct reiserfs_page_list *pl ; + + /* was this conversion done in a previous transaction? If so, return */ + if (inode->u.reiserfs_i.i_conversion_trans_id < th->t_trans_id) { + inode->u.reiserfs_i.i_converted_page = NULL ; + inode->u.reiserfs_i.i_conversion_trans_id = 0 ; + return 0 ; + } + + /* remove the page_list struct from the list, release our hold on the + ** page, and free the page_list struct + */ + pl = inode->u.reiserfs_i.i_converted_page ; + if (pl) { + if (pl->next) { + pl->next->prev = pl->prev ; + } + if (pl->prev) { + pl->prev->next = pl->next ; + } + if (SB_JOURNAL(inode->i_sb)->j_flush_pages == pl) { + SB_JOURNAL(inode->i_sb)->j_flush_pages = pl->next ; + } + put_page(pl->page) ; + reiserfs_kfree(pl, sizeof(struct reiserfs_page_list), inode->i_sb) ; + inode->u.reiserfs_i.i_converted_page = NULL ; + inode->u.reiserfs_i.i_conversion_trans_id = 0 ; + } + return 0 ; +} + +/* +** Called after a direct to indirect transaction. The unformatted node +** must be flushed to disk before the transaction commits, otherwise, we +** risk losing the data from the direct item. This adds the page +** containing the unformatted node to a list of pages that need flushing. +** +** it calls get_page(page), so the page won't disappear until we've +** flushed or removed it from our list. +** +** pointers to the reiserfs_page_list struct are stored in the inode, +** so this page can be quickly removed from the list after the tail is +** converted back into a direct item. +** +** If we fail to find the memory for the reiserfs_page_list struct, we +** just sync the page now. Not good, but safe. +** +** since this must be called with the page locked, we always set +** the do_not_lock field in the page_list struct we allocate +** +*/ +int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *th, + struct inode *inode, + struct buffer_head *bh) { + struct reiserfs_page_list *new_pl ; + +/* debugging use ONLY. Do not define this on data you care about. 
*/ +#ifdef REISERFS_NO_FLUSH_AFTER_CONVERT + return 0 ; +#endif + + get_page(bh->b_page) ; + new_pl = reiserfs_kmalloc(sizeof(struct reiserfs_page_list), GFP_BUFFER, + inode->i_sb) ; + if (!new_pl) { + put_page(bh->b_page) ; + reiserfs_warning("journal-2480: forced to flush page, out of memory\n") ; + ll_rw_block(WRITE, 1, &bh) ; + wait_on_buffer(bh) ; + if (!buffer_uptodate(bh)) { + reiserfs_panic(inode->i_sb, "journal-2484: error writing buffer %lu to disk\n", bh->b_blocknr) ; + } + inode->u.reiserfs_i.i_converted_page = NULL ; + return 0 ; + } + + new_pl->page = bh->b_page ; + new_pl->do_not_lock = 1 ; + new_pl->blocknr = bh->b_blocknr ; + new_pl->next = SB_JOURNAL(inode->i_sb)->j_flush_pages; + if (new_pl->next) { + new_pl->next->prev = new_pl ; + } + new_pl->prev = NULL ; + SB_JOURNAL(inode->i_sb)->j_flush_pages = new_pl ; + + /* if we have numbers from an old transaction, zero the converted + ** page, it has already been flushed and freed + */ + if (inode->u.reiserfs_i.i_conversion_trans_id && + inode->u.reiserfs_i.i_conversion_trans_id < th->t_trans_id) { + inode->u.reiserfs_i.i_converted_page = NULL ; + } + if (inode->u.reiserfs_i.i_converted_page) { + reiserfs_panic(inode->i_sb, "journal-2501: inode already had a converted page\n") ; + } + inode->u.reiserfs_i.i_converted_page = new_pl ; + inode->u.reiserfs_i.i_conversion_trans_id = th->t_trans_id ; + return 0 ; +} + +/* +** long and ugly. If flush, will not return until all commit +** blocks and all real buffers in the trans are on disk. +** If no_async, won't return until all commit blocks are on disk. +** +** keep reading, there are comments as you go along +*/ +static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks, + int flags) { + struct reiserfs_journal_cnode *cn, *next, *jl_cn; + struct reiserfs_journal_cnode *last_cn = NULL; + struct reiserfs_journal_desc *desc ; + struct reiserfs_journal_commit *commit ; + struct buffer_head *c_bh ; /* commit bh */ + struct buffer_head *d_bh ; /* desc bh */ + int cur_write_start = 0 ; /* start index of current log write */ + int cur_blocks_left = 0 ; /* number of journal blocks left to write */ + int old_start ; + int i ; + int jindex ; + int orig_jindex ; + int flush = flags & FLUSH_ALL ; + int commit_now = flags & COMMIT_NOW ; + int wait_on_commit = flags & WAIT ; + struct reiserfs_super_block *rs ; + + if (reiserfs_dont_log(th->t_super)) { + return 0 ; + } + + lock_journal(p_s_sb) ; + if (SB_JOURNAL(p_s_sb)->j_next_full_flush) { + flags |= FLUSH_ALL ; + flush = 1 ; + } + if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { + flags |= COMMIT_NOW ; + commit_now = 1 ; + } + + /* check_journal_end locks the journal, and unlocks if it does not return 1 + ** it tells us if we should continue with the journal_end, or just return + */ + if (!check_journal_end(th, p_s_sb, nblocks, flags)) { + return 0 ; + } + + /* check_journal_end might set these, check again */ + if (SB_JOURNAL(p_s_sb)->j_next_full_flush) { + flush = 1 ; + } + if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { + commit_now = 1 ; + } + /* + ** j must wait means we have to flush the log blocks, and the real blocks for + ** this transaction + */ + if (SB_JOURNAL(p_s_sb)->j_must_wait > 0) { + flush = 1 ; + } + + rs = SB_DISK_SUPER_BLOCK(p_s_sb) ; + /* setup description block */ + d_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start, p_s_sb->s_blocksize) ; + mark_buffer_uptodate(d_bh, 1) ; + desc = (struct reiserfs_journal_desc 
*)(d_bh)->b_data ; + memset(desc, 0, sizeof(struct reiserfs_journal_desc)) ; + memcpy(desc->j_magic, JOURNAL_DESC_MAGIC, 8) ; + desc->j_trans_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_trans_id) ; + + /* setup commit block. Don't write (keep it clean too) this one until after everyone else is written */ + c_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + + ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL(p_s_sb)->j_len + 1) % JOURNAL_BLOCK_COUNT), + p_s_sb->s_blocksize) ; + commit = (struct reiserfs_journal_commit *)c_bh->b_data ; + memset(commit, 0, sizeof(struct reiserfs_journal_commit)) ; + commit->j_trans_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_trans_id) ; + mark_buffer_uptodate(c_bh, 1) ; + + /* init this journal list */ + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2); + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ; + + /* which is faster, locking/unlocking at the start and end of the for + ** or locking once per iteration around the insert_journal_hash? + ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR + ** LOOP MUST not cause schedule to occur. 
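+ ** Note the i-- in the else branch of the loop below: a buffer that has lost
+ ** its JDirty bit gets no entry, which keeps the realblock index arrays dense.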
+ */ + + /* for each real block, add it to the journal list hash, + ** copy into real block index array in the commit or desc block + */ + for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) { + if (test_bit(BH_JDirty, &cn->bh->b_state) ) { + jl_cn = get_cnode(p_s_sb) ; + if (!jl_cn) { + reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; + } + if (i == 0) { + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ; + } + jl_cn->prev = last_cn ; + jl_cn->next = NULL ; + if (last_cn) { + last_cn->next = jl_cn ; + } + last_cn = jl_cn ; + if (cn->bh->b_blocknr >= reiserfs_get_journal_block(p_s_sb) && + cn->bh->b_blocknr < (reiserfs_get_journal_block(p_s_sb) + JOURNAL_BLOCK_COUNT)) { + reiserfs_panic(p_s_sb, "journal-2332: Trying to log block %lu, which is a log block\n", cn->bh->b_blocknr) ; + } + jl_cn->blocknr = cn->bh->b_blocknr ; + jl_cn->state = 0 ; + jl_cn->dev = cn->bh->b_dev ; + jl_cn->bh = cn->bh ; + jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ; + insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; + if (i < JOURNAL_TRANS_HALF) { + desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; + } else { + commit->j_realblock[i - JOURNAL_TRANS_HALF] = cpu_to_le32(cn->bh->b_blocknr) ; + } + } else { + i-- ; + } + } + + desc->j_len = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_len) ; + desc->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; + desc->j_trans_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_trans_id) ; + commit->j_len = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_len) ; + + /* special check in case all buffers in the journal were marked for not logging */ + if (SB_JOURNAL(p_s_sb)->j_len == 0) { + brelse(d_bh) ; + brelse(c_bh) ; + unlock_journal(p_s_sb) ; +printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; + wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + return 0 ; + } + + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ + cur_write_start = SB_JOURNAL(p_s_sb)->j_start ; + cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ; + cn = SB_JOURNAL(p_s_sb)->j_first ; + jindex = 1 ; /* start at one so we don't get the desc again */ + while(cur_blocks_left > 0) { + /* copy all the real blocks into log area. dirty log blocks */ + if (test_bit(BH_JDirty, &cn->bh->b_state)) { + struct buffer_head *tmp_bh ; + tmp_bh = getblk(p_s_sb->s_dev, reiserfs_get_journal_block(p_s_sb) + + ((cur_write_start + jindex) % JOURNAL_BLOCK_COUNT), + p_s_sb->s_blocksize) ; + mark_buffer_uptodate(tmp_bh, 1) ; + memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ; + jindex++ ; + } else { + /* JDirty cleared sometime during transaction. don't log this one */ + printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; + } + cn = cn->next ; + cur_blocks_left-- ; + } + + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. + */ + + /* now loop through and mark all buffers from this transaction as JDirty_wait + ** clear the JDirty bit, clear BH_JNew too. 
+ ** if they weren't JDirty, they weren't logged, just relse them and move on + */ + cn = SB_JOURNAL(p_s_sb)->j_first ; + while(cn) { + clear_bit(BH_JNew, &(cn->bh->b_state)) ; + if (test_bit(BH_JDirty, &(cn->bh->b_state))) { + set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; + clear_bit(BH_JDirty, &(cn->bh->b_state)) ; + } else { + brelse(cn->bh) ; + } + next = cn->next ; + free_cnode(p_s_sb, cn) ; + cn = next ; + } + + /* unlock the journal list for committing and flushing */ + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ; + + orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; + SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ; + + /* make sure to flush any data converted from direct items to + ** indirect items before allowing the commit blocks to reach the + ** disk + */ + flush_pages_before_commit(th, p_s_sb) ; + + /* honor the flush and async wishes from the caller */ + if (flush) { + + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; + flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ; + } else if (commit_now) { + if (wait_on_commit) { + flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; + } else { + commit_flush_async(p_s_sb, orig_jindex) ; + } + } + + /* reset journal values for the next transaction */ + old_start = SB_JOURNAL(p_s_sb)->j_start ; + SB_JOURNAL(p_s_sb)->j_start = (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL(p_s_sb)->j_len + 2) % JOURNAL_BLOCK_COUNT; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ; + SB_JOURNAL(p_s_sb)->j_bcount = 0 ; + SB_JOURNAL(p_s_sb)->j_last = NULL ; + SB_JOURNAL(p_s_sb)->j_first = NULL ; + SB_JOURNAL(p_s_sb)->j_len = 0 ; + SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; + SB_JOURNAL(p_s_sb)->j_trans_id++ ; + SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; + SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; + SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ; + SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ; + init_journal_hash(p_s_sb) ; + + /* if the next transaction has any chance of wrapping, flush + ** transactions that might get overwritten. If any journal lists are very + ** old flush them as well. 
+ */ + for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { + jindex = i ; + if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) { + if ((SB_JOURNAL(p_s_sb)->j_start + JOURNAL_TRANS_MAX + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) { + flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; + } + } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && + (SB_JOURNAL(p_s_sb)->j_start + JOURNAL_TRANS_MAX + 1) > JOURNAL_BLOCK_COUNT) { + if (((SB_JOURNAL(p_s_sb)->j_start + JOURNAL_TRANS_MAX + 1) % JOURNAL_BLOCK_COUNT) >= + SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) { + flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; + } + } + /* this check should always be run, to send old lists to disk */ + if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && + SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp < + (CURRENT_TIME - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; + } + } + + /* if the next journal_list is still in use, flush it */ + if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) { + flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ; + } + + /* we don't want anyone flushing the new transaction's list */ + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ; + atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ; + SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + + SB_JOURNAL_LIST_INDEX(p_s_sb)) ; + + if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) { + reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ; + } + unlock_journal(p_s_sb) ; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; + /* wake up any body waiting to join. 
*/ + wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + return 0 ; +} + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/lbalance.c linux/fs/reiserfs/lbalance.c --- v2.4.0/linux/fs/reiserfs/lbalance.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/lbalance.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,1326 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +/* these are used in do_balance.c */ + +/* leaf_move_items + leaf_shift_left + leaf_shift_right + leaf_delete_items + leaf_insert_into_buf + leaf_paste_in_buffer + leaf_cut_from_buffer + leaf_paste_entries + */ + + +/* copy copy_count entries from source directory item to dest buffer (creating new item if needed) */ +static void leaf_copy_dir_entries (struct buffer_info * dest_bi, struct buffer_head * source, + int last_first, int item_num, int from, int copy_count) +{ + struct buffer_head * dest = dest_bi->bi_bh; + int item_num_in_dest; /* either the number of target item, + or if we must create a new item, + the number of the item we will + create it next to */ + struct item_head * ih; + struct reiserfs_de_head * deh; + int copy_records_len; /* length of all records in item to be copied */ + char * records; + + ih = B_N_PITEM_HEAD (source, item_num); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih (ih)) + reiserfs_panic(0, "vs-10000: leaf_copy_dir_entries: item must be directory item"); +#endif + + /* length of all record to be copied and first byte of the last of them */ + deh = B_I_DEH (source, ih); + if (copy_count) { + copy_records_len = (from ? deh[from - 1].deh_location : ih->ih_item_len) - + deh[from + copy_count - 1].deh_location; + records = source->b_data + ih->ih_item_location + deh[from + copy_count - 1].deh_location; + } else { + copy_records_len = 0; + records = 0; + } + + /* when copy last to first, dest buffer can contain 0 items */ + item_num_in_dest = (last_first == LAST_TO_FIRST) ? (( B_NR_ITEMS(dest) ) ? 
0 : -1) : (B_NR_ITEMS(dest) - 1); + + /* if there are no items in dest or the first/last item in dest is not item of the same directory */ + if ( (item_num_in_dest == - 1) || +#ifdef REISERFS_FSCK + (last_first == FIRST_TO_LAST && are_items_mergeable (B_N_PITEM_HEAD (dest, item_num_in_dest), ih, dest->b_size) == 0) || + (last_first == LAST_TO_FIRST && are_items_mergeable (ih, B_N_PITEM_HEAD (dest, item_num_in_dest), dest->b_size) == 0)) { +#else + (last_first == FIRST_TO_LAST && le_key_k_offset (ih_version (ih), &(ih->ih_key)) == DOT_OFFSET) || + (last_first == LAST_TO_FIRST && comp_short_le_keys/*COMP_SHORT_KEYS*/ (&ih->ih_key, B_N_PKEY (dest, item_num_in_dest)))) { +#endif + /* create new item in dest */ + struct item_head new_ih; + + /* form item header */ + memcpy (&new_ih.ih_key, &ih->ih_key, KEY_SIZE); + new_ih.ih_version = cpu_to_le16 (ITEM_VERSION_1); + /* calculate item len */ + new_ih.ih_item_len = cpu_to_le16 (DEH_SIZE * copy_count + copy_records_len); + I_ENTRY_COUNT(&new_ih) = 0; + + if (last_first == LAST_TO_FIRST) { + /* form key by the following way */ + if (from < I_ENTRY_COUNT(ih)) { + set_le_ih_k_offset (&new_ih, cpu_to_le32 (le32_to_cpu (deh[from].deh_offset))); + /*memcpy (&new_ih.ih_key.k_offset, &deh[from].deh_offset, SHORT_KEY_SIZE);*/ + } else { + /* no entries will be copied to this item in this function */ + set_le_ih_k_offset (&new_ih, cpu_to_le32 (U32_MAX)); + /* this item is not yet valid, but we want I_IS_DIRECTORY_ITEM to return 1 for it, so we -1 */ + } + set_le_key_k_type (ITEM_VERSION_1, &(new_ih.ih_key), TYPE_DIRENTRY); + } + + /* insert item into dest buffer */ + leaf_insert_into_buf (dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0); + } else { + /* prepare space for entries */ + leaf_paste_in_buffer (dest_bi, (last_first==FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT, + DEH_SIZE * copy_count + copy_records_len, records, 0 + ); + } + + item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest)-1) : 0; + + leaf_paste_entries (dest_bi->bi_bh, item_num_in_dest, + (last_first == FIRST_TO_LAST) ? I_ENTRY_COUNT(B_N_PITEM_HEAD (dest, item_num_in_dest)) : 0, + copy_count, deh + from, records, + DEH_SIZE * copy_count + copy_records_len + ); +} + + +/* Copy the first (if last_first == FIRST_TO_LAST) or last (last_first == LAST_TO_FIRST) item or + part of it or nothing (see the return 0 below) from SOURCE to the end + (if last_first) or beginning (!last_first) of the DEST */ +/* returns 1 if anything was copied, else 0 */ +static int leaf_copy_boundary_item (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int bytes_or_entries) +{ + struct buffer_head * dest = dest_bi->bi_bh; + int dest_nr_item, src_nr_item; /* number of items in the source and destination buffers */ + struct item_head * ih; + struct item_head * dih; + + dest_nr_item = B_NR_ITEMS(dest); + + if ( last_first == FIRST_TO_LAST ) { + /* if ( DEST is empty or first item of SOURCE and last item of DEST are the items of different objects + or of different types ) then there is no need to treat this item differently from the other items + that we copy, so we return */ + ih = B_N_PITEM_HEAD (src, 0); + dih = B_N_PITEM_HEAD (dest, dest_nr_item - 1); +#ifdef REISERFS_FSCK + if (!dest_nr_item || (are_items_mergeable (dih, ih, src->b_size) == 0)) +#else + if (!dest_nr_item || (!op_is_left_mergeable (&(ih->ih_key), src->b_size))) +#endif + /* there is nothing to merge */ + return 0; + +#ifdef CONFIG_REISERFS_CHECK + if ( ! 
ih->ih_item_len ) + reiserfs_panic (0, "vs-10010: leaf_copy_boundary_item: item can not have empty dynamic length"); +#endif + + if ( is_direntry_le_ih (ih) ) { + if ( bytes_or_entries == -1 ) + /* copy all entries to dest */ + bytes_or_entries = le16_to_cpu (ih->u.ih_entry_count); + leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries); + return 1; + } + + /* copy part of the body of the first item of SOURCE to the end of the body of the last item of the DEST + part defined by 'bytes_or_entries'; if bytes_or_entries == -1 copy whole body; don't create new item header + */ + if ( bytes_or_entries == -1 ) + bytes_or_entries = le16_to_cpu (ih->ih_item_len); + +#ifdef CONFIG_REISERFS_CHECK + else { + if (bytes_or_entries == le16_to_cpu (ih->ih_item_len) && is_indirect_le_ih(ih)) + if (get_ih_free_space (ih)) + reiserfs_panic (0, "vs-10020: leaf_copy_boundary_item: " + "last unformatted node must be filled entirely (%h)", + ih); + } +#endif + + /* merge first item (or its part) of src buffer with the last + item of dest buffer. Both are of the same file */ + leaf_paste_in_buffer (dest_bi, + dest_nr_item - 1, dih->ih_item_len, bytes_or_entries, B_I_PITEM(src,ih), 0 + ); + + if (is_indirect_le_ih (dih)) { +#ifdef CONFIG_REISERFS_CHECK + if (get_ih_free_space (dih)) + reiserfs_panic (0, "vs-10030: leaf_copy_boundary_item: " + "merge to left: last unformatted node of non-last indirect item %h must have zerto free space", + ih); +#endif + if (bytes_or_entries == le16_to_cpu (ih->ih_item_len)) + set_ih_free_space (dih, get_ih_free_space (ih)); + } + + return 1; + } + + + /* copy boundary item to right (last_first == LAST_TO_FIRST) */ + + /* ( DEST is empty or last item of SOURCE and first item of DEST + are the items of different object or of different types ) + */ + src_nr_item = B_NR_ITEMS (src); + ih = B_N_PITEM_HEAD (src, src_nr_item - 1); + dih = B_N_PITEM_HEAD (dest, 0); + +#ifdef REISERFS_FSCK + if (!dest_nr_item || are_items_mergeable (ih, dih, src->b_size) == 0) +#else + if (!dest_nr_item || !op_is_left_mergeable (&(dih->ih_key), src->b_size)) +#endif + return 0; + + if ( is_direntry_le_ih (ih)) { + if ( bytes_or_entries == -1 ) + /* bytes_or_entries = entries number in last item body of SOURCE */ + bytes_or_entries = le16_to_cpu (ih->u.ih_entry_count); + + leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, le16_to_cpu (ih->u.ih_entry_count) - bytes_or_entries, bytes_or_entries); + return 1; + } + + /* copy part of the body of the last item of SOURCE to the begin of the body of the first item of the DEST; + part defined by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; change first item key of the DEST; + don't create new item header + */ + +#ifdef CONFIG_REISERFS_CHECK + if (is_indirect_le_ih(ih) && get_ih_free_space (ih)) + reiserfs_panic (0, "vs-10040: leaf_copy_boundary_item: " + "merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", + ih); +#endif + + if ( bytes_or_entries == -1 ) { + /* bytes_or_entries = length of last item body of SOURCE */ + bytes_or_entries = ih->ih_item_len; + +#ifdef CONFIG_REISERFS_CHECK + if (le_ih_k_offset (dih) != le_ih_k_offset (ih) + op_bytes_number (ih, src->b_size)) + reiserfs_panic (0, "vs-10050: leaf_copy_boundary_item: items %h and %h do not match", ih, dih); +#endif + + /* change first item key of the DEST */ + set_le_ih_k_offset (dih, le_ih_k_offset (ih)); + + /* item becomes non-mergeable */ + /* or mergeable if left item was */ + set_le_ih_k_type (dih, 
le_ih_k_type (ih)); + } else { + /* merge to right only part of item */ +#ifdef CONFIG_REISERFS_CHECK + if ( le16_to_cpu (ih->ih_item_len) <= bytes_or_entries ) + reiserfs_panic (0, "vs-10060: leaf_copy_boundary_item: no so much bytes %lu (needed %lu)", + ih->ih_item_len, bytes_or_entries); +#endif + + /* change first item key of the DEST */ + if ( is_direct_le_ih (dih) ) { +#ifdef CONFIG_REISERFS_CHECK + if (le_ih_k_offset (dih) <= (unsigned long)bytes_or_entries) + reiserfs_panic (0, "vs-10070: leaf_copy_boundary_item: dih %h, bytes_or_entries(%d)", + dih, bytes_or_entries); +#endif + set_le_ih_k_offset (dih, le_ih_k_offset (dih) - bytes_or_entries); + } else { +#ifdef CONFIG_REISERFS_CHECK + if (le_ih_k_offset (dih) <= (bytes_or_entries / UNFM_P_SIZE) * dest->b_size ) + reiserfs_panic (0, "vs-10080: leaf_copy_boundary_item: dih %h, bytes_or_entries(%d)", + dih, (bytes_or_entries/UNFM_P_SIZE)*dest->b_size); +#endif + set_le_ih_k_offset (dih, le_ih_k_offset (dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size)); + } + } + + leaf_paste_in_buffer (dest_bi, 0, 0, bytes_or_entries, B_I_PITEM(src,ih) + ih->ih_item_len - bytes_or_entries, 0); + return 1; +} + + +/* copy cpy_mun items from buffer src to buffer dest + * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning from first-th item in src to tail of dest + * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning from first-th item in src to head of dest + */ +static void leaf_copy_items_entirely (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int first, int cpy_num) +{ + struct buffer_head * dest; + int nr; + int dest_before; + int last_loc, last_inserted_loc, location; + int i, j; + struct block_head * blkh; + struct item_head * ih; + +#ifdef CONFIG_REISERFS_CHECK + if (last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST) + reiserfs_panic (0, "vs-10090: leaf_copy_items_entirely: bad last_first parameter %d", last_first); + + if (B_NR_ITEMS (src) - first < cpy_num) + reiserfs_panic (0, "vs-10100: leaf_copy_items_entirely: too few items in source %d, required %d from %d", + B_NR_ITEMS(src), cpy_num, first); + + if (cpy_num < 0) + reiserfs_panic (0, "vs-10110: leaf_copy_items_entirely: can not copy negative amount of items"); + + if ( ! dest_bi ) + reiserfs_panic (0, "vs-10120: leaf_copy_items_entirely: can not copy negative amount of items"); +#endif + + dest = dest_bi->bi_bh; + +#ifdef CONFIG_REISERFS_CHECK + if ( ! dest ) + reiserfs_panic (0, "vs-10130: leaf_copy_items_entirely: can not copy negative amount of items"); +#endif + + if (cpy_num == 0) + return; + + nr = le16_to_cpu ((blkh = B_BLK_HEAD(dest))->blk_nr_item); + + /* we will insert items before 0-th or nr-th item in dest buffer. It depends of last_first parameter */ + dest_before = (last_first == LAST_TO_FIRST) ? 
0 : nr; + + /* location of head of first new item */ + ih = B_N_PITEM_HEAD (dest, dest_before); + +#ifdef CONFIG_REISERFS_CHECK + if (le16_to_cpu (blkh->blk_free_space) < cpy_num * IH_SIZE) { + reiserfs_panic (0, "vs-10140: leaf_copy_items_entirely: " + "not enough free space for headers %d (needed %d)", + B_FREE_SPACE (dest), cpy_num * IH_SIZE); + } +#endif + + /* prepare space for headers */ + memmove (ih + cpy_num, ih, (nr-dest_before) * IH_SIZE); + + /* copy item headers */ + memcpy (ih, B_N_PITEM_HEAD (src, first), cpy_num * IH_SIZE); + + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - IH_SIZE * cpy_num); + + /* location of unmovable item */ + j = location = (dest_before == 0) ? dest->b_size : (ih-1)->ih_item_location; + for (i = dest_before; i < nr + cpy_num; i ++) + ih[i-dest_before].ih_item_location = + (location -= ih[i-dest_before].ih_item_len); + + /* prepare space for items */ + last_loc = ih[nr+cpy_num-1-dest_before].ih_item_location; + last_inserted_loc = ih[cpy_num-1].ih_item_location; + + /* check free space */ +#ifdef CONFIG_REISERFS_CHECK + if (le16_to_cpu (blkh->blk_free_space) < j - last_inserted_loc) { + reiserfs_panic (0, "vs-10150: leaf_copy_items_entirely: not enough free space for items %d (needed %d)", + le16_to_cpu (blkh->blk_free_space), j - last_inserted_loc); + } +#endif + + memmove (dest->b_data + last_loc, + dest->b_data + last_loc + j - last_inserted_loc, + last_inserted_loc - last_loc); + + /* copy items */ + memcpy (dest->b_data + last_inserted_loc, B_N_PITEM(src,(first + cpy_num - 1)), + j - last_inserted_loc); + + /* sizes, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) + cpy_num); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - (j - last_inserted_loc)); + + do_balance_mark_leaf_dirty (dest_bi->tb, dest, 0); + + if (dest_bi->bi_parent) { +#ifdef CONFIG_REISERFS_CHECK + if (B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position)->dc_block_number != dest->b_blocknr) { + reiserfs_panic (0, "vs-10160: leaf_copy_items_entirely: " + "block number in bh does not match to field in disk_child structure %lu and %lu", + dest->b_blocknr, B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position)->dc_block_number); + } +#endif + B_N_CHILD (dest_bi->bi_parent, dest_bi->bi_position)->dc_size += + j - last_inserted_loc + IH_SIZE * cpy_num; + + do_balance_mark_internal_dirty (dest_bi->tb, dest_bi->bi_parent, 0); + } +} + + +/* This function splits the (liquid) item into two items (useful when + shifting part of an item into another node.) 
*/ +static void leaf_item_bottle (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, + int item_num, int cpy_bytes) +{ + struct buffer_head * dest = dest_bi->bi_bh; + struct item_head * ih; + +#ifdef CONFIG_REISERFS_CHECK + if ( cpy_bytes == -1 ) + reiserfs_panic (0, "vs-10170: leaf_item_bottle: bytes == - 1 means: do not split item"); +#endif + + if ( last_first == FIRST_TO_LAST ) { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(src,item_num))) + leaf_copy_dir_entries (dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the end of the DEST + part defined by 'cpy_bytes'; create new item header; change old item_header (????); + n_ih = new item_header; + */ + memcpy (&n_ih, ih, IH_SIZE); + n_ih.ih_item_len = cpu_to_le16 (cpy_bytes); + if (is_indirect_le_ih (ih)) { +#ifdef CONFIG_REISERFS_CHECK + if (cpy_bytes == le16_to_cpu (ih->ih_item_len) && get_ih_free_space (ih)) + reiserfs_panic (0, "vs-10180: leaf_item_bottle: " + "when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", + get_ih_free_space (ih)); +#endif + set_ih_free_space (&n_ih, 0); + } + +#ifdef CONFIG_REISERFS_CHECK + if (op_is_left_mergeable (&(ih->ih_key), src->b_size)) + reiserfs_panic (0, "vs-10190: leaf_item_bottle: bad mergeability of item %h", ih); +#endif + n_ih.ih_version = ih->ih_version;; + leaf_insert_into_buf (dest_bi, B_NR_ITEMS(dest), &n_ih, B_N_PITEM (src, item_num), 0); + } + } else { + /* if ( if item in position item_num in buffer SOURCE is directory item ) */ + if (is_direntry_le_ih(ih = B_N_PITEM_HEAD (src, item_num))) + leaf_copy_dir_entries (dest_bi, src, LAST_TO_FIRST, item_num, I_ENTRY_COUNT(ih) - cpy_bytes, cpy_bytes); + else { + struct item_head n_ih; + + /* copy part of the body of the item number 'item_num' of SOURCE to the begin of the DEST + part defined by 'cpy_bytes'; create new item header; + n_ih = new item_header; + */ + memcpy (&n_ih, ih, SHORT_KEY_SIZE); + n_ih.ih_version = cpu_to_le16 (ih_version (ih)); + if (is_direct_le_ih (ih)) { + set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + le16_to_cpu (ih->ih_item_len) - cpy_bytes); + set_le_ih_k_type (&n_ih, TYPE_DIRECT); + set_ih_free_space (&n_ih, MAX_US_INT); + } else { + /* indirect item */ +#ifdef CONFIG_REISERFS_CHECK + if (!cpy_bytes && get_ih_free_space (ih)) + reiserfs_panic (0, "vs-10200: leaf_item_bottle: ih->ih_free_space must be 0 when indirect item will be appended"); +#endif + set_le_ih_k_offset (&n_ih, le_ih_k_offset (ih) + (le16_to_cpu (ih->ih_item_len) - cpy_bytes) / UNFM_P_SIZE * dest->b_size); + set_le_ih_k_type (&n_ih, TYPE_INDIRECT); + set_ih_free_space (&n_ih, get_ih_free_space (ih)); + } + + /* set item length */ + n_ih.ih_item_len = cpu_to_le16 (cpy_bytes); + n_ih.ih_version = cpu_to_le16 (le16_to_cpu (ih->ih_version)); + leaf_insert_into_buf (dest_bi, 0, &n_ih, B_N_PITEM(src,item_num) + le16_to_cpu (ih->ih_item_len) - cpy_bytes, 0); + } + } +} + + +/* If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE to DEST. + If cpy_bytes not equal to minus one than copy cpy_num-1 whole items from SOURCE to DEST. + From last item copy cpy_num bytes for regular item and cpy_num directory entries for + directory item. 
*/
+static int leaf_copy_items (struct buffer_info * dest_bi, struct buffer_head * src, int last_first, int cpy_num,
+                            int cpy_bytes)
+{
+    struct buffer_head * dest;
+    int pos, i, src_nr_item, bytes;
+
+    dest = dest_bi->bi_bh;
+#ifdef CONFIG_REISERFS_CHECK
+    if (!dest || !src)
+        reiserfs_panic (0, "vs-10210: leaf_copy_items: !dest || !src");
+
+    if ( last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST )
+        reiserfs_panic (0, "vs-10220: leaf_copy_items: last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST");
+
+    if ( B_NR_ITEMS(src) < cpy_num )
+        reiserfs_panic (0, "vs-10230: leaf_copy_items: Not enough items: %d, required %d", B_NR_ITEMS(src), cpy_num);
+
+    if ( cpy_num < 0 )
+        reiserfs_panic (0, "vs-10240: leaf_copy_items: cpy_num < 0 (%d)", cpy_num);
+#endif
+
+    if ( cpy_num == 0 )
+        return 0;
+
+    if ( last_first == FIRST_TO_LAST ) {
+        /* copy items to left */
+        pos = 0;
+        if ( cpy_num == 1 )
+            bytes = cpy_bytes;
+        else
+            bytes = -1;
+
+        /* copy the first item, or part of it, or nothing, to the end of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */
+        i = leaf_copy_boundary_item (dest_bi, src, FIRST_TO_LAST, bytes);
+        cpy_num -= i;
+        if ( cpy_num == 0 )
+            return i;
+        pos += i;
+        if ( cpy_bytes == -1 )
+            /* copy first cpy_num items starting from position 'pos' of SOURCE to end of DEST */
+            leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num);
+        else {
+            /* copy first cpy_num-1 items starting from position 'pos-1' of the SOURCE to the end of the DEST */
+            leaf_copy_items_entirely (dest_bi, src, FIRST_TO_LAST, pos, cpy_num-1);
+
+            /* copy part of the item which number is cpy_num+pos-1 to the end of the DEST */
+            leaf_item_bottle (dest_bi, src, FIRST_TO_LAST, cpy_num+pos-1, cpy_bytes);
+        }
+    } else {
+        /* copy items to right */
+        src_nr_item = B_NR_ITEMS (src);
+        if ( cpy_num == 1 )
+            bytes = cpy_bytes;
+        else
+            bytes = -1;
+
+        /* copy the last item, or part of it, or nothing, to the beginning of the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */
+        i = leaf_copy_boundary_item (dest_bi, src, LAST_TO_FIRST, bytes);
+
+        cpy_num -= i;
+        if ( cpy_num == 0 )
+            return i;
+
+        pos = src_nr_item - cpy_num - i;
+        if ( cpy_bytes == -1 ) {
+            /* starting from position 'pos' copy last cpy_num items of SOURCE to beginning of DEST */
+            leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos, cpy_num);
+        } else {
+            /* copy last cpy_num-1 items starting from position 'pos+1' of the SOURCE to the beginning of the DEST; */
+            leaf_copy_items_entirely (dest_bi, src, LAST_TO_FIRST, pos+1, cpy_num-1);
+
+            /* copy part of the item which number is pos to the beginning of the DEST */
+            leaf_item_bottle (dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes);
+        }
+    }
+    return i;
+}
+
+
+/* there are types of copying: from S[0] to L[0],
+   from S[0] to R[0],
+   from R[0] to L[0].
for each of these we have to define parent and + positions of destination and source buffers */ +static void leaf_define_dest_src_infos (int shift_mode, struct tree_balance * tb, struct buffer_info * dest_bi, + struct buffer_info * src_bi, int * first_last, + struct buffer_head * Snew) +{ +#ifdef CONFIG_REISERFS_CHECK + memset (dest_bi, 0, sizeof (struct buffer_info)); + memset (src_bi, 0, sizeof (struct buffer_info)); +#endif + + /* define dest, src, dest parent, dest position */ + switch (shift_mode) { + case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); /* src->b_item_order */ + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position (tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position (tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->R[0]; + src_bi->bi_parent = tb->FR[0]; + src_bi->bi_position = get_right_neighbor_position (tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->L[0]; + dest_bi->bi_parent = tb->FL[0]; + dest_bi->bi_position = get_left_neighbor_position (tb, 0); + *first_last = FIRST_TO_LAST; + break; + + case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ + src_bi->tb = tb; + src_bi->bi_bh = tb->L[0]; + src_bi->bi_parent = tb->FL[0]; + src_bi->bi_position = get_left_neighbor_position (tb, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = tb->R[0]; + dest_bi->bi_parent = tb->FR[0]; + dest_bi->bi_position = get_right_neighbor_position (tb, 0); + *first_last = LAST_TO_FIRST; + break; + + case LEAF_FROM_S_TO_SNEW: + src_bi->tb = tb; + src_bi->bi_bh = PATH_PLAST_BUFFER (tb->tb_path); + src_bi->bi_parent = PATH_H_PPARENT (tb->tb_path, 0); + src_bi->bi_position = PATH_H_B_ITEM_ORDER (tb->tb_path, 0); + dest_bi->tb = tb; + dest_bi->bi_bh = Snew; + dest_bi->bi_parent = 0; + dest_bi->bi_position = 0; + *first_last = LAST_TO_FIRST; + break; + + default: + reiserfs_panic (0, "vs-10250: leaf_define_dest_src_infos: shift type is unknown (%d)", shift_mode); + } +#ifdef CONFIG_REISERFS_CHECK + if (src_bi->bi_bh == 0 || dest_bi->bi_bh == 0) { + reiserfs_panic (0, "vs-10260: leaf_define_dest_src_etc: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", + shift_mode, src_bi->bi_bh, dest_bi->bi_bh); + } +#endif +} + + + + +/* copy mov_num items and mov_bytes of the (mov_num-1)th item to + neighbor. Delete them from source */ +int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew) +{ + int ret_value; + struct buffer_info dest_bi, src_bi; + int first_last; + + leaf_define_dest_src_infos (shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew); + + ret_value = leaf_copy_items (&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes); + + leaf_delete_items (&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 
0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes); + + + return ret_value; +} + + +/* Shift shift_num items (and shift_bytes of last shifted item if shift_bytes != -1) + from S[0] to L[0] and replace the delimiting key */ +int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes) +{ + struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int i; + + /* move shift_num (and shift_bytes bytes) items from S[0] to left neighbor L[0] */ + i = leaf_move_items (LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, 0); + + if ( shift_num ) { + if (B_NR_ITEMS (S0) == 0) { /* number of items in S[0] == 0 */ + +#ifdef CONFIG_REISERFS_CHECK + if ( shift_bytes != -1 ) + reiserfs_panic (tb->tb_sb, "vs-10270: leaf_shift_left: S0 is empty now, but shift_bytes != -1 (%d)", shift_bytes); + + if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { + print_cur_tb ("vs-10275"); + reiserfs_panic (tb->tb_sb, "vs-10275: leaf_shift_left: balance condition corrupted (%c)", tb->tb_mode); + } +#endif + + if (PATH_H_POSITION (tb->tb_path, 1) == 0) + replace_key (tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT (tb->tb_path, 0), 0); + +#if 0 + /* change right_delimiting_key field in L0's block header */ + copy_key (B_PRIGHT_DELIM_KEY(tb->L[0]), B_PRIGHT_DELIM_KEY (S0)); +#endif + } else { + /* replace lkey in CFL[0] by 0-th key from S[0]; */ + replace_key (tb, tb->CFL[0], tb->lkey[0], S0, 0); + +#if 0 + /* change right_delimiting_key field in L0's block header */ + copy_key (B_PRIGHT_DELIM_KEY(tb->L[0]), B_N_PKEY (S0, 0)); +#endif +#ifdef CONFIG_REISERFS_CHECK + if (shift_bytes != -1 && !(is_direntry_le_ih (B_N_PITEM_HEAD (S0, 0)) + && !I_ENTRY_COUNT (B_N_PITEM_HEAD (S0, 0)))) { + if (!op_is_left_mergeable (B_N_PKEY (S0, 0), S0->b_size)) { + reiserfs_panic (tb->tb_sb, "vs-10280: leaf_shift_left: item must be mergeable"); + } + } +#endif + } + } + + return i; +} + + + + + +/* CLEANING STOPPED HERE */ + + + + +/* Shift shift_num (shift_bytes) items from S[0] to the right neighbor, and replace the delimiting key */ +int leaf_shift_right( + struct tree_balance * tb, + int shift_num, + int shift_bytes + ) +{ + // struct buffer_head * S0 = PATH_PLAST_BUFFER (tb->tb_path); + int ret_value; + + /* move shift_num (and shift_bytes) items from S[0] to right neighbor R[0] */ + ret_value = leaf_move_items (LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, 0); + + /* replace rkey in CFR[0] by the 0-th key from R[0] */ + if (shift_num) { + replace_key (tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); + +#if 0 + /* change right_delimiting_key field in S0's block header */ + copy_key (B_PRIGHT_DELIM_KEY(S0), B_N_PKEY (tb->R[0], 0)); +#endif + } + + return ret_value; +} + + + +static void leaf_delete_items_entirely (struct buffer_info * bi, + int first, int del_num); +/* If del_bytes == -1, starting from position 'first' delete del_num items in whole in buffer CUR. + If not. + If last_first == 0. Starting from position 'first' delete del_num-1 items in whole. Delete part of body of + the first item. Part defined by del_bytes. Don't delete first item header + If last_first == 1. Starting from position 'first+1' delete del_num-1 items in whole. Delete part of body of + the last item . Part defined by del_bytes. Don't delete last item header. 
+*/
+void leaf_delete_items (struct buffer_info * cur_bi, int last_first,
+                        int first, int del_num, int del_bytes)
+{
+    struct buffer_head * bh;
+    int item_amount = B_NR_ITEMS (bh = cur_bi->bi_bh);
+
+#ifdef CONFIG_REISERFS_CHECK
+    if ( !bh )
+        reiserfs_panic (0, "leaf_delete_items: 10155: bh is not defined");
+
+    if ( del_num < 0 )
+        reiserfs_panic (0, "leaf_delete_items: 10160: del_num can not be < 0. del_num==%d", del_num);
+
+    if ( first < 0 || first + del_num > item_amount )
+        reiserfs_panic (0, "leaf_delete_items: 10165: invalid number of first item to be deleted (%d) or "
+                        "not enough items (%d) to delete (only %d)", first, first + del_num, item_amount);
+#endif
+
+    if ( del_num == 0 )
+        return;
+
+    if ( first == 0 && del_num == item_amount && del_bytes == -1 ) {
+        make_empty_node (cur_bi);
+        do_balance_mark_leaf_dirty (cur_bi->tb, bh, 0);
+        return;
+    }
+
+    if ( del_bytes == -1 )
+        /* delete del_num items beginning from item in position first */
+        leaf_delete_items_entirely (cur_bi, first, del_num);
+    else {
+        if ( last_first == FIRST_TO_LAST ) {
+            /* delete del_num-1 items beginning from item in position first */
+            leaf_delete_items_entirely (cur_bi, first, del_num-1);
+
+            /* delete the part of the first item of the bh
+               do not delete the item header
+            */
+            leaf_cut_from_buffer (cur_bi, 0, 0, del_bytes);
+        } else {
+            struct item_head * ih;
+            int len;
+
+            /* delete del_num-1 items beginning from item in position first+1 */
+            leaf_delete_items_entirely (cur_bi, first+1, del_num-1);
+
+            if (is_direntry_le_ih (ih = B_N_PITEM_HEAD(bh, B_NR_ITEMS(bh)-1))) /* the last item is a directory item */
+                /* len = number of directory entries in this item */
+                len = le16_to_cpu (ih->u.ih_entry_count);
+            else
+                /* len = body length of item */
+                len = le16_to_cpu (ih->ih_item_len);
+
+            /* delete the part of the last item of the bh
+               do not delete the item header
+            */
+            leaf_cut_from_buffer (cur_bi, B_NR_ITEMS(bh)-1, len - del_bytes, del_bytes);
+        }
+    }
+}
+
+
+/* insert item into the leaf node at position 'before' */
+void leaf_insert_into_buf (struct buffer_info * bi, int before,
+                           struct item_head * inserted_item_ih,
+                           const char * inserted_item_body,
+                           int zeros_number)
+{
+    struct buffer_head * bh = bi->bi_bh;
+    int nr;
+    struct block_head * blkh;
+    struct item_head * ih;
+    int i;
+    int last_loc, unmoved_loc;
+    char * to;
+
+
+    nr = le16_to_cpu ((blkh = B_BLK_HEAD (bh))->blk_nr_item);
+
+#ifdef CONFIG_REISERFS_CHECK
+    /* check free space */
+    if (le16_to_cpu (blkh->blk_free_space) < le16_to_cpu (inserted_item_ih->ih_item_len) + IH_SIZE)
+        reiserfs_panic (0, "leaf_insert_into_buf: 10170: "
+                        "not enough free space in block %z, new item %h",
+                        bh, inserted_item_ih);
+    if (zeros_number > inserted_item_ih->ih_item_len)
+        reiserfs_panic (0, "vs-10172: leaf_insert_into_buf: "
+                        "zeros number == %d, item length == %d", zeros_number, inserted_item_ih->ih_item_len);
+#endif /* CONFIG_REISERFS_CHECK */
+
+
+    /* get the item before which the new item must be inserted */
+    ih = B_N_PITEM_HEAD (bh, before);
+
+    /* prepare space for the body of the new item */
+    last_loc = nr ? ih[nr - before - 1].ih_item_location : bh->b_size;
+    unmoved_loc = before ?
(ih-1)->ih_item_location : bh->b_size; + + memmove (bh->b_data + last_loc - inserted_item_ih->ih_item_len, + bh->b_data + last_loc, unmoved_loc - last_loc); + + to = bh->b_data + unmoved_loc - inserted_item_ih->ih_item_len; + memset (to, 0, zeros_number); + to += zeros_number; + + /* copy body to prepared space */ + if (inserted_item_body) + memmove (to, inserted_item_body, inserted_item_ih->ih_item_len - zeros_number); + else + memset(to, '\0', inserted_item_ih->ih_item_len - zeros_number); + + /* insert item header */ + memmove (ih + 1, ih, IH_SIZE * (nr - before)); + memmove (ih, inserted_item_ih, IH_SIZE); + + /* change locations */ + for (i = before; i < nr + 1; i ++) + ih[i-before].ih_item_location = + (unmoved_loc -= ih[i-before].ih_item_len); + + /* sizes, free space, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) + 1); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - + (IH_SIZE + inserted_item_ih->ih_item_len)); + + do_balance_mark_leaf_dirty (bi->tb, bh, 1); + + if (bi->bi_parent) { + B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size += (IH_SIZE + inserted_item_ih->ih_item_len); + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* paste paste_size bytes to affected_item_num-th item. + When item is a directory, this only prepare space for new entries */ +void leaf_paste_in_buffer (struct buffer_info * bi, int affected_item_num, + int pos_in_item, int paste_size, + const char * body, + int zeros_number) +{ + struct buffer_head * bh = bi->bi_bh; + int nr; + struct block_head * blkh; + struct item_head * ih; + int i; + int last_loc, unmoved_loc; + + + nr = le16_to_cpu ((blkh = B_BLK_HEAD(bh))->blk_nr_item); + +#ifdef CONFIG_REISERFS_CHECK + /* check free space */ + if (le16_to_cpu (blkh->blk_free_space) < paste_size) + reiserfs_panic (0, "leaf_paste_in_buffer: 10175: not enough free space: needed %d, available %d", + paste_size, le16_to_cpu (blkh->blk_free_space)); + if (zeros_number > paste_size) { + print_cur_tb ("10177"); + reiserfs_panic (0, "vs-10177: leaf_paste_in_buffer: zero number == %d, paste_size == %d", + zeros_number, paste_size); + } +#endif /* CONFIG_REISERFS_CHECK */ + + + /* item to be appended */ + ih = B_N_PITEM_HEAD(bh, affected_item_num); + + last_loc = ih[nr - affected_item_num - 1].ih_item_location; + unmoved_loc = affected_item_num ? 
(ih-1)->ih_item_location : bh->b_size; + + /* prepare space */ + memmove (bh->b_data + last_loc - paste_size, bh->b_data + last_loc, + unmoved_loc - last_loc); + + + /* change locations */ + for (i = affected_item_num; i < nr; i ++) + ih[i-affected_item_num].ih_item_location -= paste_size; + + if ( body ) { + if (!is_direntry_le_ih (ih)) { + if (!pos_in_item) { + /* shift data to right */ + memmove (bh->b_data + ih->ih_item_location + paste_size, + bh->b_data + ih->ih_item_location, ih->ih_item_len); + /* paste data in the head of item */ + memset (bh->b_data + ih->ih_item_location, 0, zeros_number); + memcpy (bh->b_data + ih->ih_item_location + zeros_number, body, paste_size - zeros_number); + } else { + memset (bh->b_data + unmoved_loc - paste_size, 0, zeros_number); + memcpy (bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number); + } + } + } + else + memset(bh->b_data + unmoved_loc - paste_size,'\0',paste_size); + + ih->ih_item_len += paste_size; + + /* change free space */ + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) - paste_size); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size += paste_size; + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* cuts DEL_COUNT entries beginning from FROM-th entry. Directory item + does not have free space, so it moves DEHs and remaining records as + necessary. Return value is size of removed part of directory item + in bytes. */ +static int leaf_cut_entries ( + struct buffer_head * bh, + struct item_head * ih, + int from, + int del_count + ) +{ + char * item; + struct reiserfs_de_head * deh; + int prev_record_offset; /* offset of record, that is (from-1)th */ + char * prev_record; /* */ + int cut_records_len; /* length of all removed records */ + int i; + + +#ifdef CONFIG_REISERFS_CHECK + /* make sure, that item is directory and there are enough entries to + remove */ + if (!is_direntry_le_ih (ih)) + reiserfs_panic (0, "leaf_cut_entries: 10180: item is not directory item"); + + if (I_ENTRY_COUNT(ih) < from + del_count) + reiserfs_panic (0, "leaf_cut_entries: 10185: item contains not enough entries: entry_cout = %d, from = %d, to delete = %d", + I_ENTRY_COUNT(ih), from, del_count); +#endif + + if (del_count == 0) + return 0; + + /* first byte of item */ + item = bh->b_data + ih->ih_item_location; + + /* entry head array */ + deh = B_I_DEH (bh, ih); + + /* first byte of remaining entries, those are BEFORE cut entries + (prev_record) and length of all removed records (cut_records_len) */ + prev_record_offset = (from ? 
deh[from - 1].deh_location : ih->ih_item_len); + cut_records_len = prev_record_offset/*from_record*/ - deh[from + del_count - 1].deh_location; + prev_record = item + prev_record_offset; + + + /* adjust locations of remaining entries */ + for (i = I_ENTRY_COUNT(ih) - 1; i > from + del_count - 1; i --) + deh[i].deh_location -= (DEH_SIZE * del_count); + + for (i = 0; i < from; i ++) + deh[i].deh_location -= DEH_SIZE * del_count + cut_records_len; + + I_ENTRY_COUNT(ih) -= del_count; + + /* shift entry head array and entries those are AFTER removed entries */ + memmove ((char *)(deh + from), + deh + from + del_count, + prev_record - cut_records_len - (char *)(deh + from + del_count)); + + /* shift records, those are BEFORE removed entries */ + memmove (prev_record - cut_records_len - DEH_SIZE * del_count, + prev_record, item + ih->ih_item_len - prev_record); + + return DEH_SIZE * del_count + cut_records_len; +} + + +/* when cut item is part of regular file + pos_in_item - first byte that must be cut + cut_size - number of bytes to be cut beginning from pos_in_item + + when cut item is part of directory + pos_in_item - number of first deleted entry + cut_size - count of deleted entries + */ +void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, + int pos_in_item, int cut_size) +{ + int nr; + struct buffer_head * bh = bi->bi_bh; + struct block_head * blkh; + struct item_head * ih; + int last_loc, unmoved_loc; + int i; + + nr = le16_to_cpu ((blkh = B_BLK_HEAD (bh))->blk_nr_item); + + /* item head of truncated item */ + ih = B_N_PITEM_HEAD (bh, cut_item_num); + + if (is_direntry_le_ih (ih)) { + /* first cut entry ()*/ + cut_size = leaf_cut_entries (bh, ih, pos_in_item, cut_size); + if (pos_in_item == 0) { + /* change key */ +#ifdef CONFIG_REISERFS_CHECK + if (cut_item_num) + reiserfs_panic (0, "leaf_cut_from_buffer: 10190: " + "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num); +#endif + /* change item key by key of first entry in the item */ + set_le_ih_k_offset (ih, le32_to_cpu (B_I_DEH (bh, ih)->deh_offset)); + /*memcpy (&ih->ih_key.k_offset, &(B_I_DEH (bh, ih)->deh_offset), SHORT_KEY_SIZE);*/ + } + } else { + /* item is direct or indirect */ +#ifdef CONFIG_REISERFS_CHECK + if (is_statdata_le_ih (ih)) + reiserfs_panic (0, "leaf_cut_from_buffer: 10195: item is stat data"); + + if (pos_in_item && pos_in_item + cut_size != le16_to_cpu (ih->ih_item_len) ) + reiserfs_panic (0, "cut_from_buf: 10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", + pos_in_item, cut_size, le16_to_cpu (ih->ih_item_len)); +#endif + + /* shift item body to left if cut is from the head of item */ + if (pos_in_item == 0) { + memmove (bh->b_data + le16_to_cpu (ih->ih_item_location), bh->b_data + le16_to_cpu (ih->ih_item_location) + cut_size, + le16_to_cpu (ih->ih_item_len) - cut_size); + + /* change key of item */ + if (is_direct_le_ih (ih)) + set_le_ih_k_offset (ih, le_ih_k_offset (ih) + cut_size); + else { + set_le_ih_k_offset (ih, le_ih_k_offset (ih) + (cut_size / UNFM_P_SIZE) * bh->b_size); +#ifdef CONFIG_REISERFS_CHECK + if ( le16_to_cpu (ih->ih_item_len) == cut_size && get_ih_free_space (ih) ) + reiserfs_panic (0, "leaf_cut_from_buf: 10205: invalid ih_free_space (%h)", ih); +#endif + } + } + } + + + /* location of the last item */ + last_loc = le16_to_cpu (ih[nr - cut_item_num - 1].ih_item_location); + + /* location of the item, which is remaining at the same place */ + unmoved_loc = cut_item_num ? 
le16_to_cpu ((ih-1)->ih_item_location) : bh->b_size; + + + /* shift */ + memmove (bh->b_data + last_loc + cut_size, bh->b_data + last_loc, + unmoved_loc - last_loc - cut_size); + + /* change item length */ +/* ih->ih_item_len -= cut_size;*/ + ih->ih_item_len = cpu_to_le16 (le16_to_cpu (ih->ih_item_len) - cut_size); + + if (is_indirect_le_ih (ih)) { + if (pos_in_item) + set_ih_free_space (ih, 0); + } + + /* change locations */ + for (i = cut_item_num; i < nr; i ++) +/* ih[i-cut_item_num].ih_item_location += cut_size;*/ + ih[i-cut_item_num].ih_item_location = + cpu_to_le16 (le16_to_cpu (ih[i-cut_item_num].ih_item_location) + cut_size); + + /* size, free space */ + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) + cut_size); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size -= cut_size; + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + +/* delete del_num items from buffer starting from the first'th item */ +static void leaf_delete_items_entirely (struct buffer_info * bi, + int first, int del_num) +{ + struct buffer_head * bh = bi->bi_bh; + int nr; + int i, j; + int last_loc, last_removed_loc; + struct block_head * blkh; + struct item_head * ih; + +#ifdef CONFIG_REISERFS_CHECK + if (bh == NULL) + reiserfs_panic (0, "leaf_delete_items_entirely: 10210: buffer is 0"); + + if (del_num < 0) + reiserfs_panic (0, "leaf_delete_items_entirely: 10215: del_num less than 0 (%d)", del_num); +#endif /* CONFIG_REISERFS_CHECK */ + + if (del_num == 0) + return; + + nr = le16_to_cpu ((blkh = B_BLK_HEAD(bh))->blk_nr_item); + +#ifdef CONFIG_REISERFS_CHECK + if (first < 0 || first + del_num > nr) + reiserfs_panic (0, "leaf_delete_items_entirely: 10220: first=%d, number=%d, there is %d items", first, del_num, nr); +#endif /* CONFIG_REISERFS_CHECK */ + + if (first == 0 && del_num == nr) { + /* this does not work */ + make_empty_node (bi); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + return; + } + + ih = B_N_PITEM_HEAD (bh, first); + + /* location of unmovable item */ + j = (first == 0) ? 
bh->b_size : (ih-1)->ih_item_location; + + /* delete items */ + last_loc = ih[nr-1-first].ih_item_location; + last_removed_loc = ih[del_num-1].ih_item_location; + + memmove (bh->b_data + last_loc + j - last_removed_loc, + bh->b_data + last_loc, last_removed_loc - last_loc); + + /* delete item headers */ + memmove (ih, ih + del_num, (nr - first - del_num) * IH_SIZE); + + /* change item location */ + for (i = first; i < nr - del_num; i ++) + ih[i-first].ih_item_location += j - last_removed_loc; + + /* sizes, item number */ + blkh->blk_nr_item = cpu_to_le16 (le16_to_cpu (blkh->blk_nr_item) - del_num); + blkh->blk_free_space = cpu_to_le16 (le16_to_cpu (blkh->blk_free_space) + (j - last_removed_loc + IH_SIZE * del_num)); + + do_balance_mark_leaf_dirty (bi->tb, bh, 0); + + if (bi->bi_parent) { + B_N_CHILD (bi->bi_parent, bi->bi_position)->dc_size -= j - last_removed_loc + IH_SIZE * del_num; + do_balance_mark_internal_dirty (bi->tb, bi->bi_parent, 0); + } +} + + + + + +/* paste new_entry_count entries (new_dehs, records) into position before to item_num-th item */ +void leaf_paste_entries ( + struct buffer_head * bh, + int item_num, + int before, + int new_entry_count, + struct reiserfs_de_head * new_dehs, + const char * records, + int paste_size + ) +{ + struct item_head * ih; + char * item; + struct reiserfs_de_head * deh; + char * insert_point; + int i, old_entry_num; + + if (new_entry_count == 0) + return; + + ih = B_N_PITEM_HEAD(bh, item_num); + +#ifdef CONFIG_REISERFS_CHECK + /* make sure, that item is directory, and there are enough records in it */ + if (!is_direntry_le_ih (ih)) + reiserfs_panic (0, "leaf_paste_entries: 10225: item is not directory item"); + + if (I_ENTRY_COUNT (ih) < before) + reiserfs_panic (0, "leaf_paste_entries: 10230: there are no entry we paste entries before. entry_count = %d, before = %d", + I_ENTRY_COUNT (ih), before); +#endif + + + /* first byte of dest item */ + item = bh->b_data + ih->ih_item_location; + + /* entry head array */ + deh = B_I_DEH (bh, ih); + + /* new records will be pasted at this point */ + insert_point = item + (before ? 
deh[before - 1].deh_location : (ih->ih_item_len - paste_size)); + + /* adjust locations of records that will be AFTER new records */ + for (i = I_ENTRY_COUNT(ih) - 1; i >= before; i --) + deh[i].deh_location += DEH_SIZE * new_entry_count; + + /* adjust locations of records that will be BEFORE new records */ + for (i = 0; i < before; i ++) + deh[i].deh_location += paste_size; + + old_entry_num = I_ENTRY_COUNT(ih); + I_ENTRY_COUNT(ih) += new_entry_count; + + /* prepare space for pasted records */ + memmove (insert_point + paste_size, insert_point, item + (ih->ih_item_len - paste_size) - insert_point); + + /* copy new records */ + memcpy (insert_point + DEH_SIZE * new_entry_count, records, + paste_size - DEH_SIZE * new_entry_count); + + /* prepare space for new entry heads */ + deh += before; + memmove ((char *)(deh + new_entry_count), deh, insert_point - (char *)deh); + + /* copy new entry heads */ + deh = (struct reiserfs_de_head *)((char *)deh); + memcpy (deh, new_dehs, DEH_SIZE * new_entry_count); + + /* set locations of new records */ + for (i = 0; i < new_entry_count; i ++) + deh[i].deh_location += + (- new_dehs[new_entry_count - 1].deh_location + insert_point + DEH_SIZE * new_entry_count - item); + + + /* change item key if neccessary (when we paste before 0-th entry */ + if (!before) + { +#ifdef CONFIG_REISERFS_CHECK +/* + if ( old_entry_num && COMP_SHORT_KEYS ((unsigned long *)&ih->ih_key.k_offset, + &(new_dehs->deh_offset)) <= 0) + reiserfs_panic (0, "leaf_paste_entries: 10235: new key must be less, that old key"); +*/ +#endif + set_le_ih_k_offset (ih, le32_to_cpu (new_dehs->deh_offset)); +/* memcpy (&ih->ih_key.k_offset, + &new_dehs->deh_offset, SHORT_KEY_SIZE);*/ + } + +#ifdef CONFIG_REISERFS_CHECK + { + int prev, next; + /* check record locations */ + deh = B_I_DEH (bh, ih); + for (i = 0; i < I_ENTRY_COUNT(ih); i ++) { + next = (i < I_ENTRY_COUNT(ih) - 1) ? deh[i + 1].deh_location : 0; + prev = (i != 0) ? deh[i - 1].deh_location : 0; + + if (prev && prev <= deh[i].deh_location) + reiserfs_warning ("vs-10240: leaf_paste_entries: directory item (%h) corrupted (prev %a, cur(%d) %a)\n", + ih, deh + i - 1, i, deh + i); + if (next && next >= deh[i].deh_location) + reiserfs_warning ("vs-10250: leaf_paste_entries: directory item (%h) corrupted (cur(%d) %a, next %a)\n", + ih, i, deh + i, deh + i + 1); + } + } +#endif + +} + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/namei.c linux/fs/reiserfs/namei.c --- v2.4.0/linux/fs/reiserfs/namei.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/namei.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,1221 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + /* there should be an overview right + here, as there should be in every + conceptual grouping of code. This + should be combined with dir.c and + called dir.c (naming will become + too large to be called one file in + a few years), stop senselessly + imitating the incoherent + structuring of code used by other + filesystems. */ + +#define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } +#define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--; + +// directory item contains array of entry headers. 
This performs +// binary search through that array +static int bin_search_in_dir_item (struct reiserfs_dir_entry * de, loff_t off) +{ + struct item_head * ih = de->de_ih; + struct reiserfs_de_head * deh = de->de_deh; + int rbound, lbound, j; + + lbound = 0; + rbound = I_ENTRY_COUNT (ih) - 1; + + for (j = (rbound + lbound) / 2; lbound <= rbound; j = (rbound + lbound) / 2) { + if (off < deh_offset (deh + j)) { + rbound = j - 1; + continue; + } + if (off > deh_offset (deh + j)) { + lbound = j + 1; + continue; + } + // this is not name found, but matched third key component + de->de_entry_num = j; + return NAME_FOUND; + } + + de->de_entry_num = lbound; + return NAME_NOT_FOUND; +} + + +// comment? maybe something like set de to point to what the path points to? +static inline void set_de_item_location (struct reiserfs_dir_entry * de, struct path * path) +{ + de->de_bh = get_bh (path); + de->de_ih = get_ih (path); + de->de_deh = B_I_DEH (de->de_bh, de->de_ih); + de->de_item_num = PATH_LAST_POSITION (path); +} + + +// de_bh, de_ih, de_deh (points to first element of array), de_item_num is set +inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de) +{ + struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; + + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + + de->de_entrylen = entry_length (de->de_bh, de->de_ih, de->de_entry_num); + de->de_namelen = de->de_entrylen - (de_with_sd (deh) ? SD_SIZE : 0); + de->de_name = B_I_PITEM (de->de_bh, de->de_ih) + le16_to_cpu (deh->deh_location); + if (de->de_name[de->de_namelen - 1] == 0) + de->de_namelen = strlen (de->de_name); +} + + +// what entry points to +static inline void set_de_object_key (struct reiserfs_dir_entry * de) +{ + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + de->de_dir_id = le32_to_cpu (de->de_deh[de->de_entry_num].deh_dir_id); + de->de_objectid = le32_to_cpu (de->de_deh[de->de_entry_num].deh_objectid); +} + + +static inline void store_de_entry_key (struct reiserfs_dir_entry * de) +{ + struct reiserfs_de_head * deh = de->de_deh + de->de_entry_num; + + if (de->de_entry_num >= ih_entry_count (de->de_ih)) + BUG (); + + /* store key of the found entry */ + de->de_entry_key.version = ITEM_VERSION_1; + de->de_entry_key.on_disk_key.k_dir_id = le32_to_cpu (de->de_ih->ih_key.k_dir_id); + de->de_entry_key.on_disk_key.k_objectid = le32_to_cpu (de->de_ih->ih_key.k_objectid); + set_cpu_key_k_offset (&(de->de_entry_key), deh_offset (deh)); + set_cpu_key_k_type (&(de->de_entry_key), TYPE_DIRENTRY); +} + + +/* We assign a key to each directory item, and place multiple entries +in a single directory item. A directory item has a key equal to the +key of the first directory entry in it. + +This function first calls search_by_key, then, if item whose first +entry matches is not found it looks for the entry inside directory +item found by search_by_key. Fills the path to the entry, and to the +entry position in the item + +*/ + +/* The function is NOT SCHEDULE-SAFE! 
*/ +int search_by_entry_key (struct super_block * sb, struct cpu_key * key, + struct path * path, struct reiserfs_dir_entry * de) +{ + int retval; + + retval = search_item (sb, key, path); + switch (retval) { + case ITEM_NOT_FOUND: + if (!PATH_LAST_POSITION (path)) { + reiserfs_warning ("vs-7000: search_by_entry_key: search_by_key returned item position == 0"); + pathrelse(path) ; + return IO_ERROR ; + } + PATH_LAST_POSITION (path) --; + + case ITEM_FOUND: + break; + + case IO_ERROR: + return retval; + + default: + pathrelse (path); + reiserfs_warning ("vs-7002: search_by_entry_key: no path to here"); + return IO_ERROR; + } + + set_de_item_location (de, path); + +#ifdef CONFIG_REISERFS_CHECK + if (!is_direntry_le_ih (de->de_ih) || + COMP_SHORT_KEYS (&(de->de_ih->ih_key), key)) { + print_block (de->de_bh, 0, -1, -1); + reiserfs_panic (sb, "vs-7005: search_by_entry_key: found item %h is not directory item or " + "does not belong to the same directory as key %k", de->de_ih, key); + } +#endif /* CONFIG_REISERFS_CHECK */ + + /* binary search in directory item by third componen t of the + key. sets de->de_entry_num of de */ + retval = bin_search_in_dir_item (de, cpu_key_k_offset (key)); + path->pos_in_item = de->de_entry_num; + if (retval != NAME_NOT_FOUND) { + // ugly, but rename needs de_bh, de_deh, de_name, de_namelen, de_objectid set + set_de_name_and_namelen (de); + set_de_object_key (de); + } + return retval; +} + + + +/* Keyed 32-bit hash function using TEA in a Davis-Meyer function */ + +/* The third component is hashed, and you can choose from more than + one hash function. Per directory hashes are not yet implemented + but are thought about. This function should be moved to hashes.c + Jedi, please do so. -Hans */ + +static __u32 get_third_component (struct super_block * s, + const char * name, int len) +{ + __u32 res; + + if (!len || (len == 1 && name[0] == '.')) + return DOT_OFFSET; + if (len == 2 && name[0] == '.' && name[1] == '.') + return DOT_DOT_OFFSET; + + res = s->u.reiserfs_sb.s_hash_function (name, len); + + // take bits from 7-th to 30-th including both bounds + res = GET_HASH_VALUE(res); + if (res == 0) + // needed to have no names before "." and ".." those have hash + // value == 0 and generation conters 1 and 2 accordingly + res = 128; + return res + MAX_GENERATION_NUMBER; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +static int reiserfs_match (struct reiserfs_dir_entry * de, + const char * name, int namelen) +{ + int retval = NAME_NOT_FOUND; + + if ((namelen == de->de_namelen) && + !memcmp(de->de_name, name, de->de_namelen)) + retval = (de_visible (de->de_deh + de->de_entry_num) ? 
NAME_FOUND : NAME_FOUND_INVISIBLE); + + return retval; +} + + +/* de's de_bh, de_ih, de_deh, de_item_num, de_entry_num are set already */ + + /* used when hash collisions exist */ + + +static int linear_search_in_dir_item (struct cpu_key * key, struct reiserfs_dir_entry * de, + const char * name, int namelen) +{ + struct reiserfs_de_head * deh = de->de_deh; + int retval; + int i; + + i = de->de_entry_num; + + if (i == I_ENTRY_COUNT (de->de_ih) || + GET_HASH_VALUE (deh_offset (deh + i)) != GET_HASH_VALUE (cpu_key_k_offset (key))) { + i --; + } + +#ifdef CONFIG_REISERFS_CHECK + if (de->de_deh != B_I_DEH (de->de_bh, de->de_ih)) + reiserfs_panic (0, "vs-7010: linear_search_in_dir_item: array of entry headers not found"); +#endif /* CONFIG_REISERFS_CHECK */ + + deh += i; + + for (; i >= 0; i --, deh --) { + if (GET_HASH_VALUE (deh_offset (deh)) != + GET_HASH_VALUE (cpu_key_k_offset (key))) { + // hash value does not match, no need to check whole name + return NAME_NOT_FOUND; + } + + /* mark, that this generation number is used */ + if (de->de_gen_number_bit_string) + set_bit (GET_GENERATION_NUMBER (deh_offset (deh)), de->de_gen_number_bit_string); + + // calculate pointer to name and namelen + de->de_entry_num = i; + set_de_name_and_namelen (de); + + if ((retval = reiserfs_match (de, name, namelen)) != NAME_NOT_FOUND) { + // de's de_name, de_namelen, de_recordlen are set. Fill the rest: + + // key of pointed object + set_de_object_key (de); + + store_de_entry_key (de); + + // retval can be NAME_FOUND or NAME_FOUND_INVISIBLE + return retval; + } + } + + if (GET_GENERATION_NUMBER (le_ih_k_offset (de->de_ih)) == 0) + /* we have reached left most entry in the node. In common we + have to go to the left neighbor, but if generation counter + is 0 already, we know for sure, that there is no name with + the same hash value */ + // FIXME: this work correctly only because hash value can not + // be 0. Btw, in case of Yura's hash it is probably possible, + // so, this is a bug + return NAME_NOT_FOUND; + +#ifdef CONFIG_REISERFS_CHECK + if (de->de_item_num) + reiserfs_panic (0, "vs-7015: linear_search_in_dir_item: " + "two diritems of the same directory in one node?"); +#endif /* CONFIG_REISERFS_CHECK */ + + return GOTO_PREVIOUS_ITEM; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +// may return NAME_FOUND, NAME_FOUND_INVISIBLE, NAME_NOT_FOUND +// FIXME: should add something like IOERROR +static int reiserfs_find_entry (struct inode * dir, const char * name, int namelen, + struct path * path_to_entry, struct reiserfs_dir_entry * de) +{ + struct cpu_key key_to_search; + int retval; + + + if (namelen > REISERFS_MAX_NAME_LEN (dir->i_sb->s_blocksize)) + return NAME_NOT_FOUND; + + /* we will search for this key in the tree */ + make_cpu_key (&key_to_search, dir, + get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); + + while (1) { + retval = search_by_entry_key (dir->i_sb, &key_to_search, path_to_entry, de); + if (retval == IO_ERROR) + // FIXME: still has to be dealt with + + /* I want you to conform to our error + printing standard. How many times + do I have to ask? 
-Hans */ + + BUG (); + + /* compare names for all entries having given hash value */ + retval = linear_search_in_dir_item (&key_to_search, de, name, namelen); + if (retval != GOTO_PREVIOUS_ITEM) { + /* there is no need to scan directory anymore. Given entry found or does not exist */ + path_to_entry->pos_in_item = de->de_entry_num; + return retval; + } + + /* there is left neighboring item of this directory and given entry can be there */ + set_cpu_key_k_offset (&key_to_search, le_ih_k_offset (de->de_ih) - 1); + pathrelse (path_to_entry); + + } /* while (1) */ +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +struct dentry * reiserfs_lookup (struct inode * dir, struct dentry * dentry) +{ + int retval; + struct inode * inode = 0; + struct reiserfs_dir_entry de; + INITIALIZE_PATH (path_to_entry); + + reiserfs_check_lock_depth("lookup") ; + + if (dentry->d_name.len > REISERFS_MAX_NAME_LEN (dir->i_sb->s_blocksize)) + return ERR_PTR(-ENAMETOOLONG); + + de.de_gen_number_bit_string = 0; + retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); + pathrelse (&path_to_entry); + if (retval == NAME_FOUND) { + inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); + if (!inode) { + return ERR_PTR(-EACCES); + } + } + + d_add(dentry, inode); + return NULL; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// + +/* add entry to the directory (entry can be hidden). + +insert definition of when hidden directories are used here -Hans + + Does not mark dir inode dirty, do it after successesfull call to it */ + +static int reiserfs_add_entry (struct reiserfs_transaction_handle *th, struct inode * dir, + const char * name, int namelen, struct inode * inode, + int visible) +{ + struct cpu_key entry_key; + struct reiserfs_de_head * deh; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + int bit_string [MAX_GENERATION_NUMBER / (sizeof(int) * 8) + 1]; + int gen_number; + char small_buf[32+DEH_SIZE] ; /* 48 bytes now and we avoid kmalloc + if we create file with short name */ + char * buffer; + int buflen, paste_size; + int retval; + + + /* cannot allow items to be added into a busy deleted directory */ + if (!namelen) + return -EINVAL; + + if (namelen > REISERFS_MAX_NAME_LEN (dir->i_sb->s_blocksize)) + return -ENAMETOOLONG; + + /* each entry has unique key. compose it */ + make_cpu_key (&entry_key, dir, + get_third_component (dir->i_sb, name, namelen), TYPE_DIRENTRY, 3); + + /* get memory for composing the entry */ + buflen = DEH_SIZE + ROUND_UP (namelen); + if (buflen > sizeof (small_buf)) { + buffer = reiserfs_kmalloc (buflen, GFP_BUFFER, dir->i_sb); + if (buffer == 0) + return -ENOMEM; + } else + buffer = small_buf; + + paste_size = (old_format_only (dir->i_sb)) ? 
(DEH_SIZE + namelen) : buflen; + + /* fill buffer : directory entry head, name[, dir objectid | , stat data | ,stat data, dir objectid ] */ + deh = (struct reiserfs_de_head *)buffer; + deh->deh_location = 0; + deh->deh_offset = cpu_to_le32 (cpu_key_k_offset (&entry_key)); + deh->deh_state = 0; + /* put key (ino analog) to de */ + deh->deh_dir_id = INODE_PKEY (inode)->k_dir_id; + deh->deh_objectid = INODE_PKEY (inode)->k_objectid; + + /* copy name */ + memcpy ((char *)(deh + 1), name, namelen); + /* padd by 0s to the 4 byte boundary */ + padd_item ((char *)(deh + 1), ROUND_UP (namelen), namelen); + + /* entry is ready to be pasted into tree, set 'visibility' and 'stat data in entry' attributes */ + mark_de_without_sd (deh); + visible ? mark_de_visible (deh) : mark_de_hidden (deh); + + /* find the proper place for the new entry */ + memset (bit_string, 0, sizeof (bit_string)); + de.de_gen_number_bit_string = (char *)bit_string; + if (reiserfs_find_entry (dir, name, namelen, &path, &de) == NAME_FOUND) { + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EEXIST; + } + + if (find_first_nonzero_bit (bit_string, MAX_GENERATION_NUMBER + 1) < MAX_GENERATION_NUMBER + 1) { + /* there are few names with given hash value */ + gen_number = find_first_zero_bit (bit_string, MAX_GENERATION_NUMBER + 1); + if (gen_number > MAX_GENERATION_NUMBER) { + /* there is no free generation number */ + reiserfs_warning ("reiserfs_add_entry: Congratulations! we have got hash function screwed up\n"); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EHASHCOLLISION;//EBADSLT + } + /* adjust offset of directory enrty */ + deh->deh_offset = cpu_to_le32 (SET_GENERATION_NUMBER (deh_offset (deh), gen_number)); + set_cpu_key_k_offset (&entry_key, le32_to_cpu (deh->deh_offset)); + + /* find place for new entry */ + if (search_by_entry_key (dir->i_sb, &entry_key, &path, &de) == NAME_FOUND) { + reiserfs_warning ("vs-7032: reiserfs_add_entry: " + "entry with this key (%k) already exists", &entry_key); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + pathrelse (&path); + return -EHASHCOLLISION; + } + } else { + deh->deh_offset = cpu_to_le32 (SET_GENERATION_NUMBER (le32_to_cpu (deh->deh_offset), 0)); + set_cpu_key_k_offset (&entry_key, le32_to_cpu (deh->deh_offset)); + } + + /* perform the insertion of the entry that we have prepared */ + retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size); + if (buffer != small_buf) + reiserfs_kfree (buffer, buflen, dir->i_sb); + if (retval) { + reiserfs_check_path(&path) ; + return retval; + } + + dir->i_size += paste_size; + dir->i_blocks = ((dir->i_size + 511) >> 9); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + if (!S_ISDIR (inode->i_mode) && visible) + // reiserfs_mkdir or reiserfs_rename will do that by itself + reiserfs_update_sd (th, dir); + + reiserfs_check_path(&path) ; + return 0; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
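
reiserfs_add_entry() above resolves hash collisions by packing a small generation number into the low bits of the entry offset: reiserfs_find_entry() records every generation already used for that hash value in a bit string, the first free one is then chosen with find_first_zero_bit(), and -EHASHCOLLISION is returned when all of them are taken. A rough standalone sketch of that composition, assuming the split described in get_third_component() (generation number in the low 7 bits, hash value in bits 7..30); the macro definitions below are illustrative, not copies of the patch:

#include <stdio.h>
#include <stdint.h>

#define MAX_GENERATION_NUMBER  127                         /* illustrative */
#define GET_HASH_VALUE(off)    ((off) & ~(uint32_t)MAX_GENERATION_NUMBER & 0x7fffffffu)

/* Compose the directory-entry offset for a name whose hash produced
 * 'hash_off', given a bitmap of generation numbers already used for that
 * hash value.  Returns -1 when all 128 generations are taken (the case
 * reiserfs_add_entry() reports as a hash collision). */
static long compose_entry_offset(uint32_t hash_off, const unsigned char *used)
{
    int gen;

    for (gen = 0; gen <= MAX_GENERATION_NUMBER; gen++)
        if (!(used[gen / 8] & (1u << (gen % 8))))
            return (long)(GET_HASH_VALUE(hash_off) | (uint32_t)gen);

    return -1;   /* no free generation number for this hash value */
}

int main(void)
{
    unsigned char used[16] = { 0x07 };       /* generations 0, 1, 2 taken */
    uint32_t hash_off = 0x12345680;          /* some hash value, generation 0 */

    printf("new offset: 0x%lx\n", compose_entry_offset(hash_off, used));
    return 0;
}
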
+// +int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode) +{ + int retval; + struct inode * inode; + int windex ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 ; + struct reiserfs_transaction_handle th ; + + + inode = get_empty_inode() ; + if (!inode) { + return -ENOMEM ; + } + journal_begin(&th, dir->i_sb, jbegin_count) ; + th.t_caller = "create" ; + windex = push_journal_writer("reiserfs_create") ; + inode = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode, &retval); + if (!inode) { + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; + } + + inode->i_op = &reiserfs_file_inode_operations; + inode->i_fop = &reiserfs_file_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations ; + + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + pop_journal_writer(windex) ; + // FIXME: should we put iput here and have stat data deleted + // in the same transactioin + journal_end(&th, dir->i_sb, jbegin_count) ; + iput (inode); + return retval; + } + + d_instantiate(dentry, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return 0; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +int reiserfs_mknod (struct inode * dir, struct dentry *dentry, int mode, int rdev) +{ + int retval; + struct inode * inode; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + inode = get_empty_inode() ; + if (!inode) { + return -ENOMEM ; + } + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_mknod") ; + + inode = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode, &retval); + if (!inode) { + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; + } + + init_special_inode(inode, mode, rdev) ; + + //FIXME: needed for block and char devices only + reiserfs_update_sd (&th, inode); + + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + iput (inode); + return retval; + } + + d_instantiate(dentry, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return 0; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
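
reiserfs_create() and reiserfs_mknod() above show the transaction bracket that the namespace operations in this file share: journal_begin()/push_journal_writer() open the transaction, the inode and its directory entry are created inside it, and on failure the half-created object is made consistent (link count dropped, stat data updated) before journal_end(), with iput() only after the transaction is closed. A compressed sketch of that control flow with stand-in functions (the *_stub() names and error values are placeholders, not the kernel API):

#include <stdio.h>

static int  journal_begin_stub(void)  { puts("journal_begin");  return 0; }
static void journal_end_stub(void)    { puts("journal_end"); }
static int  new_inode_stub(void)      { return 1; }                 /* ok      */
static int  add_entry_stub(void)      { return -17; }               /* -EEXIST */
static void update_sd_stub(void)      { puts("update stat data in this txn"); }
static void iput_stub(void)           { puts("iput after journal_end"); }

static int create_like_operation(void)
{
    int err;

    if (journal_begin_stub())
        return -5;                       /* illustrative I/O error */

    if (!new_inode_stub()) {
        journal_end_stub();
        return -12;                      /* -ENOMEM */
    }

    err = add_entry_stub();
    if (err) {
        update_sd_stub();                /* make the orphan consistent in the txn */
        journal_end_stub();
        iput_stub();                     /* release the inode after the txn closes */
        return err;
    }

    journal_end_stub();                  /* success: entry and inode commit together */
    return 0;
}

int main(void)
{
    printf("result: %d\n", create_like_operation());
    return 0;
}
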
+// +int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode) +{ + int retval; + struct inode * inode; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + inode = get_empty_inode() ; + if (!inode) { + return -ENOMEM ; + } + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_mkdir") ; + + /* inc the link count now, so another writer doesn't overflow it while + ** we sleep later on. + */ + INC_DIR_INODE_NLINK(dir) + + mode = S_IFDIR | mode; + inode = reiserfs_new_inode (&th, dir, mode, 0/*symlink*/, + old_format_only (dir->i_sb) ? EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, + dentry, inode, &retval); + if (!inode) { + pop_journal_writer(windex) ; + dir->i_nlink-- ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; + } + + inode->i_op = &reiserfs_dir_inode_operations; + inode->i_fop = &reiserfs_dir_operations; + + // note, _this_ add_entry will not update dir's stat data + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + inode->i_nlink = 0; + DEC_DIR_INODE_NLINK(dir); + reiserfs_update_sd (&th, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + iput (inode); + return retval; + } + + // the above add_entry did not update dir's stat data + reiserfs_update_sd (&th, dir); + + d_instantiate(dentry, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return 0; +} + +static inline int reiserfs_empty_dir(struct inode *inode) { + /* we can cheat because an old format dir cannot have + ** EMPTY_DIR_SIZE, and a new format dir cannot have + ** EMPTY_DIR_SIZE_V1. So, if the inode is either size, + ** regardless of disk format version, the directory is empty. + */ + if (inode->i_size != EMPTY_DIR_SIZE && + inode->i_size != EMPTY_DIR_SIZE_V1) { + return 0 ; + } + return 1 ; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
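
reiserfs_mkdir() above bumps the parent's link count through INC_DIR_INODE_NLINK(), defined near the top of namei.c: once a directory would overflow REISERFS_LINK_MAX the count is pinned at 1, and the matching DEC macro then leaves it alone, so an untracked link count is never corrupted by later mkdir/rmdir pairs. A small standalone rendering of the two macros (the REISERFS_LINK_MAX value here is illustrative):

#include <stdio.h>

#define REISERFS_LINK_MAX 0xffff   /* illustrative limit only */

/* Mirror of INC_DIR_INODE_NLINK / DEC_DIR_INODE_NLINK: nlink == 1 means
 * "link count no longer tracked for this directory". */
static void dir_nlink_inc(unsigned int *nlink)
{
    if (*nlink != 1) {
        (*nlink)++;
        if (*nlink >= REISERFS_LINK_MAX)
            *nlink = 1;                 /* would overflow: stop tracking */
    }
}

static void dir_nlink_dec(unsigned int *nlink)
{
    if (*nlink != 1)
        (*nlink)--;
}

int main(void)
{
    unsigned int nlink = REISERFS_LINK_MAX - 1;

    dir_nlink_inc(&nlink);   /* overflow -> pinned at 1 */
    dir_nlink_dec(&nlink);   /* stays at 1: count is no longer meaningful */
    printf("nlink = %u\n", nlink);
    return 0;
}
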
+// +int reiserfs_rmdir (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + + + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_rmdir") ; + + de.de_gen_number_bit_string = 0; + if (reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_rmdir; + } + inode = dentry->d_inode; + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_rmdir; + } + if (!reiserfs_empty_dir(inode)) { + retval = -ENOTEMPTY; + goto end_rmdir; + } + + /* cut entry from dir directory */ + retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, + NULL, /* page */ + 0/*new file size - not used here*/); + if (retval < 0) + goto end_rmdir; + + if ( inode->i_nlink != 2 && inode->i_nlink != 1 ) + printk ("reiserfs_rmdir: empty directory has nlink != 2 (%d)\n", inode->i_nlink); + + inode->i_nlink = 0; + inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + reiserfs_update_sd (&th, inode); + + DEC_DIR_INODE_NLINK(dir) + dir->i_size -= (DEH_SIZE + de.de_entrylen); + dir->i_blocks = ((dir->i_size + 511) >> 9); + reiserfs_update_sd (&th, dir); + + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; + return 0; + + end_rmdir: + /* we must release path, because we did not call + reiserfs_cut_from_item, or reiserfs_cut_from_item does not + release path if operation was not complete */ + pathrelse (&path); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
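
After reiserfs_rmdir() above cuts the victim's entry out of the parent item, the parent's i_size shrinks by the entry head plus the stored (padded) name, and i_blocks is recomputed in 512-byte units; reiserfs_unlink() and reiserfs_add_entry() do the same bookkeeping in the opposite direction. A minimal sketch of that arithmetic (the DEH_SIZE value below is illustrative):

#include <stdio.h>

#define DEH_SIZE 16   /* illustrative size of struct reiserfs_de_head */

/* dir->i_size -= DEH_SIZE + entrylen;
 * dir->i_blocks = ((dir->i_size + 511) >> 9);  -- as in the patch */
static void shrink_parent(unsigned long *i_size, unsigned long *i_blocks,
                          unsigned int entrylen)
{
    *i_size  -= DEH_SIZE + entrylen;
    *i_blocks = (*i_size + 511) >> 9;
}

int main(void)
{
    unsigned long size = 1024, blocks = 2;

    shrink_parent(&size, &blocks, 12);   /* remove a 12-byte (padded) name */
    printf("i_size=%lu i_blocks=%lu\n", size, blocks);
    return 0;
}
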
+// +int reiserfs_unlink (struct inode * dir, struct dentry *dentry) +{ + int retval; + struct inode * inode; + struct reiserfs_dir_entry de; + INITIALIZE_PATH (path); + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_unlink") ; + + de.de_gen_number_bit_string = 0; + if (reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de) == NAME_NOT_FOUND) { + retval = -ENOENT; + goto end_unlink; + } + inode = dentry->d_inode; + + if (de.de_objectid != inode->i_ino) { + // FIXME: compare key of an object and a key found in the + // entry + retval = -EIO; + goto end_unlink; + } + + if (!inode->i_nlink) { + printk("reiserfs_unlink: deleting nonexistent file (%s:%lu), %d\n", + kdevname(inode->i_dev), inode->i_ino, inode->i_nlink); + inode->i_nlink = 1; + } + + retval = reiserfs_cut_from_item (&th, &path, &(de.de_entry_key), dir, NULL, 0); + if (retval < 0) + goto end_unlink; + + inode->i_nlink--; + inode->i_ctime = CURRENT_TIME; + reiserfs_update_sd (&th, inode); + + dir->i_size -= (de.de_entrylen + DEH_SIZE); + dir->i_blocks = ((dir->i_size + 511) >> 9); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + reiserfs_update_sd (&th, dir); + + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; + return 0; + + end_unlink: + pathrelse (&path); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + reiserfs_check_path(&path) ; + return retval; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
+// +int reiserfs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) +{ + int retval; + struct inode * inode; + char * name; + int item_len; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + + inode = get_empty_inode() ; + if (!inode) { + return -ENOMEM ; + } + + item_len = ROUND_UP (strlen (symname)); + if (item_len > MAX_ITEM_LEN (dir->i_sb->s_blocksize)) { + iput(inode) ; + return -ENAMETOOLONG; + } + + name = kmalloc (item_len, GFP_BUFFER); + if (!name) { + iput(inode) ; + return -ENOMEM; + } + memcpy (name, symname, strlen (symname)); + padd_item (name, item_len, strlen (symname)); + + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_symlink") ; + + inode = reiserfs_new_inode (&th, dir, S_IFLNK | S_IRWXUGO, name, strlen (symname), dentry, + inode, &retval); + kfree (name); + if (inode == 0) { /* reiserfs_new_inode iputs for us */ + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; + } + + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &reiserfs_address_space_operations; + + // must be sure this inode is written with this transaction + // + //reiserfs_update_sd (&th, inode, READ_BLOCKS); + + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + inode->i_nlink--; + reiserfs_update_sd (&th, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + iput (inode); + return retval; + } + + d_instantiate(dentry, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return 0; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
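
reiserfs_symlink() above, like reiserfs_add_entry(), stores names rounded up and zero-filled to a 4-byte boundary (the "padd by 0s to the 4 byte boundary" step performed by ROUND_UP and padd_item in the patch). A standalone sketch of that padding, with round_up4() and pad_name() as illustrative stand-ins for the patch's helpers:

#include <stdio.h>
#include <string.h>

/* Round a name length up to the next 4-byte boundary. */
static size_t round_up4(size_t len)
{
    return (len + 3) & ~(size_t)3;
}

/* Copy a name into an item buffer and zero the padding bytes. */
static void pad_name(char *dst, const char *name)
{
    size_t len = strlen(name);
    size_t padded = round_up4(len);

    memcpy(dst, name, len);
    memset(dst + len, 0, padded - len);
}

int main(void)
{
    char buf[16];

    pad_name(buf, "abcde");                              /* 5 bytes -> 8 */
    printf("padded length: %zu\n", round_up4(strlen("abcde")));
    return 0;
}
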
+// +int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) +{ + int retval; + struct inode *inode = old_dentry->d_inode; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (inode->i_nlink >= REISERFS_LINK_MAX) { + //FIXME: sd_nlink is 32 bit for new files + return -EMLINK; + } + + journal_begin(&th, dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_link") ; + + /* create new entry */ + retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, + inode, 1/*visible*/); + if (retval) { + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return retval; + } + + inode->i_nlink++; + inode->i_ctime = CURRENT_TIME; + reiserfs_update_sd (&th, inode); + + atomic_inc(&inode->i_count) ; + d_instantiate(dentry, inode); + pop_journal_writer(windex) ; + journal_end(&th, dir->i_sb, jbegin_count) ; + return 0; +} + + +// de contains information pointing to an entry which +static int de_still_valid (const char * name, int len, struct reiserfs_dir_entry * de) +{ + struct reiserfs_dir_entry tmp = *de; + + // recalculate pointer to name and name length + set_de_name_and_namelen (&tmp); + // FIXME: could check more + if (tmp.de_namelen != len || memcmp (name, de->de_name, len)) + return 0; + return 1; +} + + +static int entry_points_to_object (const char * name, int len, struct reiserfs_dir_entry * de, struct inode * inode) +{ + if (!de_still_valid (name, len, de)) + return 0; + + if (inode) { + if (!de_visible (de->de_deh + de->de_entry_num)) + reiserfs_panic (0, "vs-7042: entry_points_to_object: entry must be visible"); + return (de->de_objectid == inode->i_ino) ? 1 : 0; + } + + /* this must be added hidden entry */ + if (de_visible (de->de_deh + de->de_entry_num)) + reiserfs_panic (0, "vs-7043: entry_points_to_object: entry must be visible"); + + return 1; +} + + +/* sets key of objectid the entry has to point to */ +static void set_ino_in_dir_entry (struct reiserfs_dir_entry * de, struct key * key) +{ + de->de_deh[de->de_entry_num].deh_dir_id = key->k_dir_id; + de->de_deh[de->de_entry_num].deh_objectid = key->k_objectid; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// + +/* + * process, that is going to call fix_nodes/do_balance must hold only + * one path. 
If it holds 2 or more, it can get into endless waiting in + * get_empty_nodes or its clones + */ +int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry, + struct inode * new_dir, struct dentry *new_dentry) +{ + int retval; + INITIALIZE_PATH (old_entry_path); + INITIALIZE_PATH (new_entry_path); + INITIALIZE_PATH (dot_dot_entry_path); + struct item_head new_entry_ih, old_entry_ih ; + struct reiserfs_dir_entry old_de, new_de, dot_dot_de; + struct inode * old_inode, * new_inode; + int windex ; + struct reiserfs_transaction_handle th ; + int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + + + old_inode = old_dentry->d_inode; + new_inode = new_dentry->d_inode; + + // make sure, that oldname still exists and points to an object we + // are going to rename + old_de.de_gen_number_bit_string = 0; + retval = reiserfs_find_entry (old_dir, old_dentry->d_name.name, old_dentry->d_name.len, + &old_entry_path, &old_de); + pathrelse (&old_entry_path); + if (retval != NAME_FOUND || old_de.de_objectid != old_inode->i_ino) { + // FIXME: IO error is possible here + return -ENOENT; + } + + if (S_ISDIR(old_inode->i_mode)) { + // make sure, that directory being renamed has correct ".." + // and that its new parent directory has not too many links + // already + + if (new_inode) { + if (!reiserfs_empty_dir(new_inode)) { + return -ENOTEMPTY; + } + } + + /* directory is renamed, its parent directory will be changed, + ** so find ".." entry + */ + dot_dot_de.de_gen_number_bit_string = 0; + retval = reiserfs_find_entry (old_inode, "..", 2, &dot_dot_entry_path, &dot_dot_de); + pathrelse (&dot_dot_entry_path); + if (retval != NAME_FOUND) + return -EIO; + + /* inode number of .. must equal old_dir->i_ino */ + if (dot_dot_de.de_objectid != old_dir->i_ino) + return -EIO; + } + + journal_begin(&th, old_dir->i_sb, jbegin_count) ; + windex = push_journal_writer("reiserfs_rename") ; + + /* add new entry (or find the existing one) */ + retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, + old_inode, 0); + if (retval == -EEXIST) { + // FIXME: is it possible, that new_inode == 0 here? If yes, it + // is not clear how does ext2 handle that + if (!new_inode) { + printk ("reiserfs_rename: new entry is found, new inode == 0\n"); + BUG (); + } + } else if (retval) { + pop_journal_writer(windex) ; + journal_end(&th, old_dir->i_sb, jbegin_count) ; + return retval; + } + + + while (1) { + // look for old name using corresponding entry key (found by reiserfs_find_entry) + if (search_by_entry_key (new_dir->i_sb, &old_de.de_entry_key, &old_entry_path, &old_de) != NAME_FOUND) + BUG (); + + copy_item_head(&old_entry_ih, get_ih(&old_entry_path)) ; + + // look for new name by reiserfs_find_entry + new_de.de_gen_number_bit_string = 0; + retval = reiserfs_find_entry (new_dir, new_dentry->d_name.name, new_dentry->d_name.len, + &new_entry_path, &new_de); + if (retval != NAME_FOUND_INVISIBLE && retval != NAME_FOUND) + BUG (); + + copy_item_head(&new_entry_ih, get_ih(&new_entry_path)) ; + + reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1) ; + + if (S_ISDIR(old_inode->i_mode)) { + if (search_by_entry_key (new_dir->i_sb, &dot_dot_de.de_entry_key, &dot_dot_entry_path, &dot_dot_de) != NAME_FOUND) + BUG (); + // node containing ".." gets into transaction + reiserfs_prepare_for_journal(old_inode->i_sb, dot_dot_de.de_bh, 1) ; + } + /* we should check seals here, not do + this stuff, yes? Then, having + gathered everything into RAM we + should lock the buffers, yes? -Hans */ + /* probably. 
our rename needs to hold more + ** than one path at once. The seals would + ** have to be written to deal with multi-path + ** issues -chris + */ + /* sanity checking before doing the rename - avoid races many + ** of the above checks could have scheduled. We have to be + ** sure our items haven't been shifted by another process. + */ + if (!entry_points_to_object(new_dentry->d_name.name, + new_dentry->d_name.len, + &new_de, new_inode) || + item_moved(&new_entry_ih, &new_entry_path) || + item_moved(&old_entry_ih, &old_entry_path) || + !entry_points_to_object (old_dentry->d_name.name, + old_dentry->d_name.len, + &old_de, old_inode)) { + reiserfs_restore_prepared_buffer (old_inode->i_sb, new_de.de_bh); + if (S_ISDIR(old_inode->i_mode)) + reiserfs_restore_prepared_buffer (old_inode->i_sb, dot_dot_de.de_bh); +#if 0 + // FIXME: do we need this? shouldn't we simply continue? + run_task_queue(&tq_disk); + current->policy |= SCHED_YIELD; + /*current->counter = 0;*/ + schedule(); +#endif + continue; + } + +#ifdef CONFIG_REISERFS_CHECK + if (S_ISDIR(old_inode->i_mode) && + (!entry_points_to_object ("..", 2, &dot_dot_de, old_dir) || + !reiserfs_buffer_prepared(dot_dot_de.de_bh))) { + // this should be not changed + BUG (); + } +#endif + + break; + } + + /* ok, all the changes can be done in one fell swoop when we + have claimed all the buffers needed.*/ + + mark_de_visible (new_de.de_deh + new_de.de_entry_num); + set_ino_in_dir_entry (&new_de, INODE_PKEY (old_inode)); + journal_mark_dirty (&th, old_dir->i_sb, new_de.de_bh); + + mark_de_hidden (old_de.de_deh + old_de.de_entry_num); + old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME; + + if (new_inode) { + // adjust link number of the victim + if (S_ISDIR(new_inode->i_mode)) { + DEC_DIR_INODE_NLINK(new_inode) + } else { + new_inode->i_nlink--; + } + new_inode->i_ctime = CURRENT_TIME; + } + + if (S_ISDIR(old_inode->i_mode)) { + //if (dot_dot_de.de_bh) { + // adjust ".." of renamed directory + set_ino_in_dir_entry (&dot_dot_de, INODE_PKEY (new_dir)); + journal_mark_dirty (&th, new_dir->i_sb, dot_dot_de.de_bh); + + DEC_DIR_INODE_NLINK(old_dir) + if (new_inode) { + if (S_ISDIR(new_inode->i_mode)) { + DEC_DIR_INODE_NLINK(new_inode) + } else { + new_inode->i_nlink--; + } + } else { + INC_DIR_INODE_NLINK(new_dir) + } + } + + // looks like in 2.3.99pre3 brelse is atomic. so we can use pathrelse + pathrelse (&new_entry_path); + pathrelse (&dot_dot_entry_path); + + // FIXME: this reiserfs_cut_from_item's return value may screw up + // anybody, but it will panic if will not be able to find the + // entry. This needs one more clean up + if (reiserfs_cut_from_item (&th, &old_entry_path, &(old_de.de_entry_key), old_dir, NULL, 0) < 0) + reiserfs_warning ("vs-: reiserfs_rename: coudl not cut old name. 
Fsck later?\n"); + + old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; + old_dir->i_blocks = ((old_dir->i_size + 511) >> 9); + + reiserfs_update_sd (&th, old_dir); + reiserfs_update_sd (&th, new_dir); + if (new_inode) + reiserfs_update_sd (&th, new_inode); + + pop_journal_writer(windex) ; + journal_end(&th, old_dir->i_sb, jbegin_count) ; + return 0; +} + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/objectid.c linux/fs/reiserfs/objectid.c --- v2.4.0/linux/fs/reiserfs/objectid.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/objectid.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,211 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +// find where objectid map starts +#define objectid_map(s,rs) (old_format_only (s) ? \ + (__u32 *)((struct reiserfs_super_block_v1 *)rs + 1) :\ + (__u32 *)(rs + 1)) + + +#ifdef CONFIG_REISERFS_CHECK + +static void check_objectid_map (struct super_block * s, __u32 * map) +{ + if (le32_to_cpu (map[0]) != 1) + reiserfs_panic (s, "vs-15010: check_objectid_map: map corrupted"); + + // FIXME: add something else here +} + +#endif + + +/* When we allocate objectids we allocate the first unused objectid. + Each sequence of objectids in use (the odd sequences) is followed + by a sequence of objectids not in use (the even sequences). We + only need to record the last objectid in each of these sequences + (both the odd and even sequences) in order to fully define the + boundaries of the sequences. A consequence of allocating the first + objectid not in use is that under most conditions this scheme is + extremely compact. The exception is immediately after a sequence + of operations which deletes a large number of objects of + non-sequential objectids, and even then it will become compact + again as soon as more objects are created. Note that many + interesting optimizations of layout could result from complicating + objectid assignment, but we have deferred making them for now. */ + + +/* get unique object identifier */ +__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + __u32 * map = objectid_map (s, rs); + __u32 unused_objectid; + + +#ifdef CONFIG_REISERFS_CHECK + check_objectid_map (s, map); +#endif + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + /* comment needed -Hans */ + unused_objectid = le32_to_cpu (map[1]); + if (unused_objectid == U32_MAX) { + printk ("REISERFS: get_objectid: no more object ids\n"); + reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)) ; + return 0; + } + + /* This incrementation allocates the first unused objectid. That + is to say, the first entry on the objectid map is the first + unused objectid, and by incrementing it we use it. See below + where we check to see if we eliminated a sequence of unused + objectids.... */ + map[1] = cpu_to_le32 (unused_objectid + 1); + + /* Now we check to see if we eliminated the last remaining member of + the first even sequence (and can eliminate the sequence by + eliminating its last objectid from oids), and can collapse the + first two odd sequences into one sequence. If so, then the net + result is to eliminate a pair of objectids from oids. We do this + by shifting the entire map to the left. 
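
The allocator described in this comment keeps only the boundaries of the alternating used/free objectid runs, so map[1] is always the first unused objectid; taking it may exhaust the first free run, at which point the first two used runs merge and a pair of boundaries is dropped from the map, which is the memmove performed just below. A small in-memory model of that step (struct oid_map and oid_alloc are illustrative; the real code edits the little-endian map stored after the super block):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct oid_map {
    uint32_t map[32];
    int      cursize;
};

static uint32_t oid_alloc(struct oid_map *m)
{
    uint32_t id = m->map[1];            /* first unused objectid */

    m->map[1]++;                        /* ...is now in use */

    /* First free run exhausted: drop a pair of boundaries. */
    if (m->cursize > 2 && m->map[1] == m->map[2]) {
        memmove(m->map + 1, m->map + 3,
                (m->cursize - 3) * sizeof(uint32_t));
        m->cursize -= 2;
    }
    return id;
}

int main(void)
{
    /* ids 1..8 in use, 9 free, 10..19 in use, everything above free */
    struct oid_map m = { { 1, 9, 10, 20 }, 4 };

    printf("allocated %u\n", oid_alloc(&m));             /* takes id 9 */
    printf("first unused now %u, cursize %d\n",          /* runs merged: */
           m.map[1], m.cursize);                         /* {1, 20}, size 2 */
    return 0;
}
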
*/ + if (le16_to_cpu (rs->s_oid_cursize) > 2 && map[1] == map[2]) { + memmove (map + 1, map + 3, (le16_to_cpu (rs->s_oid_cursize) - 3) * sizeof(__u32)); + //rs->s_oid_cursize -= 2; + rs->s_oid_cursize = cpu_to_le16 (le16_to_cpu (rs->s_oid_cursize) - 2); + } + + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); + s->s_dirt = 1; + return unused_objectid; +} + + +/* makes object identifier unused */ +void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, + __u32 objectid_to_release) +{ + struct super_block * s = th->t_super; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + __u32 * map = objectid_map (s, rs); + int i = 0; + + //return; +#ifdef CONFIG_REISERFS_CHECK + check_objectid_map (s, map); +#endif + + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); + s->s_dirt = 1; + + + /* start at the beginning of the objectid map (i = 0) and go to + the end of it (i = disk_sb->s_oid_cursize). Linear search is + what we use, though it is possible that binary search would be + more efficient after performing lots of deletions (which is + when oids is large.) We only check even i's. */ + while (i < le16_to_cpu (rs->s_oid_cursize)) { + if (objectid_to_release == le32_to_cpu (map[i])) { + /* This incrementation unallocates the objectid. */ + //map[i]++; + map[i] = cpu_to_le32 (le32_to_cpu (map[i]) + 1); + + /* Did we unallocate the last member of an odd sequence, and can shrink oids? */ + if (map[i] == map[i+1]) { + /* shrink objectid map */ + memmove (map + i, map + i + 2, + (le16_to_cpu (rs->s_oid_cursize) - i - 2) * sizeof (__u32)); + //disk_sb->s_oid_cursize -= 2; + rs->s_oid_cursize = cpu_to_le16 (le16_to_cpu (rs->s_oid_cursize) - 2); + +#ifdef CONFIG_REISERFS_CHECK + if (le16_to_cpu (rs->s_oid_cursize) < 2 || + le16_to_cpu (rs->s_oid_cursize) > le16_to_cpu (rs->s_oid_maxsize)) + reiserfs_panic (s, "vs-15005: reiserfs_release_objectid: " + "objectid map corrupted cur_size == %d (max == %d)", + le16_to_cpu (rs->s_oid_cursize), le16_to_cpu (rs->s_oid_maxsize)); +#endif + } + return; + } + + if (objectid_to_release > le32_to_cpu (map[i]) && + objectid_to_release < le32_to_cpu (map[i + 1])) { + /* size of objectid map is not changed */ + if (objectid_to_release + 1 == le32_to_cpu (map[i + 1])) { + //objectid_map[i+1]--; + map[i + 1] = cpu_to_le32 (le32_to_cpu (map[i + 1]) - 1); + return; + } + + if (rs->s_oid_cursize == rs->s_oid_maxsize) + /* objectid map must be expanded, but there is no space */ + return; + + /* expand the objectid map*/ + memmove (map + i + 3, map + i + 1, + (le16_to_cpu (rs->s_oid_cursize) - i - 1) * sizeof(__u32)); + map[i + 1] = cpu_to_le32 (objectid_to_release); + map[i + 2] = cpu_to_le32 (objectid_to_release + 1); + rs->s_oid_cursize = cpu_to_le16 (le16_to_cpu (rs->s_oid_cursize) + 2); + return; + } + i += 2; + } + + reiserfs_warning ("vs-15010: reiserfs_release_objectid: tried to free free object id (%lu)", + objectid_to_release); +} + + +int reiserfs_convert_objectid_map_v1(struct super_block *s) { + struct reiserfs_super_block *disk_sb = SB_DISK_SUPER_BLOCK (s); + int cur_size = le16_to_cpu(disk_sb->s_oid_cursize) ; + int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2 ; + int old_max = le16_to_cpu(disk_sb->s_oid_maxsize) ; + struct reiserfs_super_block_v1 *disk_sb_v1 ; + __u32 *objectid_map, *new_objectid_map ; + int i ; + + disk_sb_v1=(struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); + objectid_map = (__u32 *)(disk_sb_v1 + 1) ; + new_objectid_map = 
(__u32 *)(disk_sb + 1) ; + + if (cur_size > new_size) { + /* mark everyone used that was listed as free at the end of the objectid + ** map + */ + objectid_map[new_size - 1] = objectid_map[cur_size - 1] ; + disk_sb->s_oid_cursize = cpu_to_le16(new_size) ; + } + /* move the smaller objectid map past the end of the new super */ + for (i = new_size - 1 ; i >= 0 ; i--) { + objectid_map[i + (old_max - new_size)] = objectid_map[i] ; + } + + + /* set the max size so we don't overflow later */ + disk_sb->s_oid_maxsize = cpu_to_le16(new_size) ; + + /* finally, zero out the unused chunk of the new super */ + memset(disk_sb->s_unused, 0, sizeof(disk_sb->s_unused)) ; + return 0 ; +} + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/prints.c linux/fs/reiserfs/prints.c --- v2.4.0/linux/fs/reiserfs/prints.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/prints.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,881 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" +#include + +#endif + +#include + +static char error_buf[1024]; +static char fmt_buf[1024]; +static char off_buf[80]; + + +static char * cpu_offset (struct cpu_key * key) +{ + if (cpu_key_k_type(key) == TYPE_DIRENTRY) + sprintf (off_buf, "%Lu(%Lu)", + (unsigned long long)GET_HASH_VALUE (cpu_key_k_offset (key)), + (unsigned long long)GET_GENERATION_NUMBER (cpu_key_k_offset (key))); + else + sprintf (off_buf, "0x%Lx", (unsigned long long)cpu_key_k_offset (key)); + return off_buf; +} + + +static char * le_offset (struct key * key) +{ + int version; + + version = le_key_version (key); + if (le_key_k_type (version, key) == TYPE_DIRENTRY) + sprintf (off_buf, "%Lu(%Lu)", + (unsigned long long)GET_HASH_VALUE (le_key_k_offset (version, key)), + (unsigned long long)GET_GENERATION_NUMBER (le_key_k_offset (version, key))); + else + sprintf (off_buf, "0x%Lx", (unsigned long long)le_key_k_offset (version, key)); + return off_buf; +} + + +static char * cpu_type (struct cpu_key * key) +{ + if (cpu_key_k_type (key) == TYPE_STAT_DATA) + return "SD"; + if (cpu_key_k_type (key) == TYPE_DIRENTRY) + return "DIR"; + if (cpu_key_k_type (key) == TYPE_DIRECT) + return "DIRECT"; + if (cpu_key_k_type (key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + + +static char * le_type (struct key * key) +{ + int version; + + version = le_key_version (key); + + if (le_key_k_type (version, key) == TYPE_STAT_DATA) + return "SD"; + if (le_key_k_type (version, key) == TYPE_DIRENTRY) + return "DIR"; + if (le_key_k_type (version, key) == TYPE_DIRECT) + return "DIRECT"; + if (le_key_k_type (version, key) == TYPE_INDIRECT) + return "IND"; + return "UNKNOWN"; +} + + +/* %k */ +static void sprintf_le_key (char * buf, struct key * key) +{ + if (key) + sprintf (buf, "[%d %d %s %s]", le32_to_cpu (key->k_dir_id), + le32_to_cpu (key->k_objectid), le_offset (key), le_type (key)); + else + sprintf (buf, "[NULL]"); +} + + +/* %K */ +static void sprintf_cpu_key (char * buf, struct cpu_key * key) +{ + if (key) + sprintf (buf, "[%d %d %s %s]", key->on_disk_key.k_dir_id, + key->on_disk_key.k_objectid, cpu_offset (key), cpu_type (key)); + else + sprintf (buf, "[NULL]"); +} + + +static void sprintf_item_head (char * buf, struct item_head * ih) +{ + if (ih) { + sprintf (buf, "%s", (ih_version (ih) == ITEM_VERSION_2) ? 
"*NEW* " : "*OLD*"); + sprintf_le_key (buf + strlen (buf), &(ih->ih_key)); + sprintf (buf + strlen (buf), ", item_len %d, item_location %d, " + "free_space(entry_count) %d", + ih->ih_item_len, ih->ih_item_location, ih_free_space (ih)); + } else + sprintf (buf, "[NULL]"); +} + + +static void sprintf_direntry (char * buf, struct reiserfs_dir_entry * de) +{ + char name[20]; + + memcpy (name, de->de_name, de->de_namelen > 19 ? 19 : de->de_namelen); + name [de->de_namelen > 19 ? 19 : de->de_namelen] = 0; + sprintf (buf, "\"%s\"==>[%d %d]", name, de->de_dir_id, de->de_objectid); +} + + +static void sprintf_block_head (char * buf, struct buffer_head * bh) +{ + sprintf (buf, "level=%d, nr_items=%d, free_space=%d rdkey ", + B_LEVEL (bh), B_NR_ITEMS (bh), B_FREE_SPACE (bh)); +#if 0 + if (B_LEVEL (bh) == DISK_LEAF_NODE_LEVEL) + sprintf_le_key (buf + strlen (buf), B_PRIGHT_DELIM_KEY (bh)); +#endif +} + + +static void sprintf_buffer_head (char * buf, struct buffer_head * bh) +{ + sprintf (buf, "dev %s, size %d, blocknr %ld, count %d, list %d, state 0x%lx, page %p, (%s, %s, %s)", + kdevname (bh->b_dev), bh->b_size, bh->b_blocknr, atomic_read (&(bh->b_count)), bh->b_list, + bh->b_state, bh->b_page, + buffer_uptodate (bh) ? "UPTODATE" : "!UPTODATE", + buffer_dirty (bh) ? "DIRTY" : "CLEAN", + buffer_locked (bh) ? "LOCKED" : "UNLOCKED"); +} + + +static void sprintf_disk_child (char * buf, struct disk_child * dc) +{ + sprintf (buf, "[dc_number=%d, dc_size=%u]", dc->dc_block_number, dc->dc_size); +} + + +static char * is_there_reiserfs_struct (char * fmt, int * what, int * skip) +{ + char * k = fmt; + + *skip = 0; + + while (1) { + k = strstr (k, "%"); + if (!k) + break; + if (k && (k[1] == 'k' || k[1] == 'K' || k[1] == 'h' || k[1] == 't' || + k[1] == 'z' || k[1] == 'b' || k[1] == 'y')) { + *what = k[1]; + break; + } + (*skip) ++; + k ++; + } + return k; +} + + +/* debugging reiserfs we used to print out a lot of different + variables, like keys, item headers, buffer heads etc. Values of + most fields matter. So it took a long time just to write + appropriative printk. With this reiserfs_warning you can use format + specification for complex structures like you used to do with + printfs for integers, doubles and pointers. 
For instance, to print + out key structure you have to write just: + reiserfs_warning ("bad key %k", key); + instead of + printk ("bad key %lu %lu %lu %lu", key->k_dir_id, key->k_objectid, + key->k_offset, key->k_uniqueness); +*/ + +#define do_reiserfs_warning \ +{\ + char * fmt1 = fmt_buf;\ + va_list args;\ + int i, j;\ + char * k;\ + char * p = error_buf;\ + int what, skip;\ +\ + strcpy (fmt1, fmt);\ + va_start(args, fmt);\ +\ + while (1) {\ + k = is_there_reiserfs_struct (fmt1, &what, &skip);\ + if (k != 0) {\ + *k = 0;\ + p += vsprintf (p, fmt1, args);\ +\ + for (i = 0; i < skip; i ++)\ + j = va_arg (args, int);\ +\ + switch (what) {\ + case 'k':\ + sprintf_le_key (p, va_arg(args, struct key *));\ + break;\ + case 'K':\ + sprintf_cpu_key (p, va_arg(args, struct cpu_key *));\ + break;\ + case 'h':\ + sprintf_item_head (p, va_arg(args, struct item_head *));\ + break;\ + case 't':\ + sprintf_direntry (p, va_arg(args, struct reiserfs_dir_entry *));\ + break;\ + case 'y':\ + sprintf_disk_child (p, va_arg(args, struct disk_child *));\ + break;\ + case 'z':\ + sprintf_block_head (p, va_arg(args, struct buffer_head *));\ + break;\ + case 'b':\ + sprintf_buffer_head (p, va_arg(args, struct buffer_head *));\ + break;\ + }\ + p += strlen (p);\ + fmt1 = k + 2;\ + } else {\ + i = vsprintf (p, fmt1, args);\ + break;\ + }\ + }\ +\ + va_end(args);\ +} + + +/* in addition to usual conversion specifiers this accepts reiserfs + specific conversion specifiers: + %k to print little endian key, + %K to print cpu key, + %h to print item_head, + %t to print directory entry + %z to print block head (arg must be struct buffer_head * + %b to print buffer_head +*/ +void reiserfs_warning (const char * fmt, ...) +{ + do_reiserfs_warning; + /* console_print (error_buf); */ + printk ("%s", error_buf); +} + +void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...) +{ +#ifdef CONFIG_REISERFS_CHECK + do_reiserfs_warning; + printk ("%s", error_buf); +#else + ; +#endif +} + +/* The format: + + maintainer-errorid: [function-name:] message + + where errorid is unique to the maintainer and function-name is + optional, is recommended, so that anyone can easily find the bug + with a simple grep for the short to type string + maintainer-errorid. Don't bother with reusing errorids, there are + lots of numbers out there. + + Example: + + reiserfs_panic( + p_sb, "reiser-29: reiserfs_new_blocknrs: " + "one of search_start or rn(%d) is equal to MAX_B_NUM," + "which means that we are optimizing location based on the bogus location of a temp buffer (%p).", + rn, bh + ); + + Regular panic()s sometimes clear the screen before the message can + be read, thus the need for the while loop. + + Numbering scheme for panic used by Vladimir and Anatoly( Hans completely ignores this scheme, and considers it + pointless complexity): + + panics in reiserfs_fs.h have numbers from 1000 to 1999 + super.c 2000 to 2999 + preserve.c 3000 to 3999 + bitmap.c 4000 to 4999 + stree.c 5000 to 5999 + prints.c 6000 to 6999 + namei.c 7000 to 7999 + fix_nodes.c 8000 to 8999 + dir.c 9000 to 9999 + lbalance.c 10000 to 10999 + ibalance.c 11000 to 11999 not ready + do_balan.c 12000 to 12999 + inode.c 13000 to 13999 + file.c 14000 to 14999 + objectid.c 15000 - 15999 + buffer.c 16000 - 16999 + symlink.c 17000 - 17999 + + . */ + + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + +void reiserfs_panic (struct super_block * sb, const char * fmt, ...) 
+{ +#ifdef __KERNEL__ + show_reiserfs_locks() ; +#endif + do_reiserfs_warning; + printk ("%s", error_buf); + BUG (); + // console_print (error_buf); + // for (;;); + +#ifdef __KERNEL__ + + /* comment before release */ + //for (;;); + +#if 0 /* this is not needed, the state is ignored */ + if (sb && !(sb->s_flags & MS_RDONLY)) { + sb->u.reiserfs_sb.s_mount_state |= REISERFS_ERROR_FS; + sb->u.reiserfs_sb.s_rs->s_state = REISERFS_ERROR_FS; + + mark_buffer_dirty(sb->u.reiserfs_sb.s_sbh) ; + sb->s_dirt = 1; + } +#endif + + /* this is to prevent panic from syncing this filesystem */ + if (sb && sb->s_lock) + sb->s_lock=0; + if (sb) + sb->s_flags |= MS_RDONLY; + + panic ("REISERFS: panic (device %s): %s\n", + sb ? kdevname(sb->s_dev) : "sb == 0", error_buf); +#else + exit (0); +#endif +} + + +void print_virtual_node (struct virtual_node * vn) +{ + int i; + struct virtual_item * vi; + + printk ("VIRTUAL NODE CONTAINS %d items, has size %d,%s,%s, ITEM_POS=%d POS_IN_ITEM=%d MODE=\'%c\'\n", + vn->vn_nr_item, vn->vn_size, + (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE )? "left mergeable" : "", + (vn->vn_vi[vn->vn_nr_item - 1].vi_type & VI_TYPE_RIGHT_MERGEABLE) ? "right mergeable" : "", + vn->vn_affected_item_num, vn->vn_pos_in_item, vn->vn_mode); + + vi = vn->vn_vi; + for (i = 0; i < vn->vn_nr_item; i ++, vi ++) + op_print_vi (vi); + +} + + +void print_path (struct tree_balance * tb, struct path * path) +{ + int h = 0; + struct buffer_head * bh; + + if (tb) { + while (tb->insert_size[h]) { + bh = PATH_H_PBUFFER (path, h); + printk ("block %lu (level=%d), position %d\n", bh ? bh->b_blocknr : 0, + bh ? B_LEVEL (bh) : 0, PATH_H_POSITION (path, h)); + h ++; + } + } else { + int offset = path->path_length; + struct buffer_head * bh; + printk ("Offset Bh (b_blocknr, b_count) Position Nr_item\n"); + while ( offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { + bh = PATH_OFFSET_PBUFFER (path, offset); + printk ("%6d %10p (%9lu, %7d) %8d %7d\n", offset, + bh, bh ? bh->b_blocknr : 0, bh ? atomic_read (&(bh->b_count)) : 0, + PATH_OFFSET_POSITION (path, offset), bh ? B_NR_ITEMS (bh) : -1); + + offset --; + } + } + +} + + +/* this prints internal nodes (4 keys/items in line) (dc_number, + dc_size)[k_dirid, k_objectid, k_offset, k_uniqueness](dc_number, + dc_size)...*/ +static int print_internal (struct buffer_head * bh, int first, int last) +{ + struct key * key; + struct disk_child * dc; + int i; + int from, to; + + if (!B_IS_KEYS_LEVEL (bh)) + return 1; + + check_internal (bh); + + if (first == -1) { + from = 0; + to = B_NR_ITEMS (bh); + } else { + from = first; + to = last < B_NR_ITEMS (bh) ? 
last : B_NR_ITEMS (bh); + } + + reiserfs_warning ("INTERNAL NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + dc = B_N_CHILD (bh, from); + reiserfs_warning ("PTR %d: %y ", from, dc); + + for (i = from, key = B_N_PDELIM_KEY (bh, from), dc ++; i < to; i ++, key ++, dc ++) { + reiserfs_warning ("KEY %d: %k PTR %d: %y ", i, key, i + 1, dc); + if (i && i % 4 == 0) + printk ("\n"); + } + printk ("\n"); + return 0; +} + + + + + +static int print_leaf (struct buffer_head * bh, int print_mode, int first, int last) +{ + struct block_head * blkh; + struct item_head * ih; + int i; + int from, to; + + if (!B_IS_ITEMS_LEVEL (bh)) + return 1; + + check_leaf (bh); + + blkh = B_BLK_HEAD (bh); + ih = B_N_PITEM_HEAD (bh,0); + + printk ("\n===================================================================\n"); + reiserfs_warning ("LEAF NODE (%ld) contains %z\n", bh->b_blocknr, bh); + + if (!(print_mode & PRINT_LEAF_ITEMS)) { + reiserfs_warning ("FIRST ITEM_KEY: %k, LAST ITEM KEY: %k\n", + &(ih->ih_key), &((ih + le16_to_cpu (blkh->blk_nr_item) - 1)->ih_key)); + return 0; + } + + if (first < 0 || first > le16_to_cpu (blkh->blk_nr_item) - 1) + from = 0; + else + from = first; + + if (last < 0 || last > le16_to_cpu (blkh->blk_nr_item)) + to = le16_to_cpu (blkh->blk_nr_item); + else + to = last; + + ih += from; + printk ("-------------------------------------------------------------------------------\n"); + printk ("|##| type | key | ilen | free_space | version | loc |\n"); + for (i = from; i < to; i++, ih ++) { + printk ("-------------------------------------------------------------------------------\n"); + reiserfs_warning ("|%2d| %h |\n", i, ih); + if (print_mode & PRINT_LEAF_ITEMS) + op_print_item (ih, B_I_PITEM (bh, ih)); + } + + printk ("===================================================================\n"); + + return 0; +} + +static char * reiserfs_version (char * buf) +{ + __u16 * pversion; + + pversion = (__u16 *)(buf) + 36; + if (*pversion == 0) + return "0"; + if (*pversion == 2) + return "2"; + return "Unknown"; +} + + +/* return 1 if this is not super block */ +static int print_super_block (struct buffer_head * bh) +{ + struct reiserfs_super_block * rs = (struct reiserfs_super_block *)(bh->b_data); + int skipped, data_blocks; + + + if (strncmp (rs->s_magic, REISERFS_SUPER_MAGIC_STRING, strlen ( REISERFS_SUPER_MAGIC_STRING)) && + strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, strlen ( REISER2FS_SUPER_MAGIC_STRING))) + return 1; + + printk ("%s\'s super block in block %ld\n======================\n", kdevname (bh->b_dev), bh->b_blocknr); + printk ("Reiserfs version %s\n", reiserfs_version (bh->b_data)); + printk ("Block count %u\n", le32_to_cpu (rs->s_block_count)); + printk ("Blocksize %d\n", le16_to_cpu (rs->s_blocksize)); + printk ("Free blocks %u\n", le32_to_cpu (rs->s_free_blocks)); + skipped = bh->b_blocknr; // FIXME: this would be confusing if + // someone stores reiserfs super block in some data block ;) + data_blocks = le32_to_cpu (rs->s_block_count) - skipped - 1 - + le16_to_cpu (rs->s_bmap_nr) - (le32_to_cpu (rs->s_orig_journal_size) + 1) - + le32_to_cpu (rs->s_free_blocks); + printk ("Busy blocks (skipped %d, bitmaps - %d, journal blocks - %d\n" + "1 super blocks, %d data blocks\n", + skipped, le16_to_cpu (rs->s_bmap_nr), + (le32_to_cpu (rs->s_orig_journal_size) + 1), data_blocks); + printk ("Root block %u\n", le32_to_cpu (rs->s_root_block)); + printk ("Journal block (first) %d\n", le32_to_cpu (rs->s_journal_block)); + printk ("Journal dev %d\n", le32_to_cpu (rs->s_journal_dev)); + 
printk ("Journal orig size %d\n", le32_to_cpu (rs->s_orig_journal_size)); + printk ("Filesystem state %s\n", + (le16_to_cpu (rs->s_state) == REISERFS_VALID_FS) ? "VALID" : "ERROR"); + printk ("Hash function \"%s\"\n", le16_to_cpu (rs->s_hash_function_code) == TEA_HASH ? "tea" : + ((le16_to_cpu (rs->s_hash_function_code) == YURA_HASH) ? "rupasov" : "unknown")); + +#if 0 + __u32 s_journal_trans_max ; /* max number of blocks in a transaction. */ + __u32 s_journal_block_count ; /* total size of the journal. can change over time */ + __u32 s_journal_max_batch ; /* max number of blocks to batch into a trans */ + __u32 s_journal_max_commit_age ; /* in seconds, how old can an async commit be */ + __u32 s_journal_max_trans_age ; /* in seconds, how old can a transaction be */ +#endif + printk ("Tree height %d\n", rs->s_tree_height); + return 0; +} + + +static int print_desc_block (struct buffer_head * bh) +{ + struct reiserfs_journal_desc * desc; + + desc = (struct reiserfs_journal_desc *)(bh->b_data); + if (memcmp(desc->j_magic, JOURNAL_DESC_MAGIC, 8)) + return 1; + + printk ("Desc block %lu (j_trans_id %d, j_mount_id %d, j_len %d)", + bh->b_blocknr, desc->j_trans_id, desc->j_mount_id, desc->j_len); + + return 0; +} + + +void print_block (struct buffer_head * bh, ...)//int print_mode, int first, int last) +{ + va_list args; + int mode, first, last; + + va_start (args, bh); + + if ( ! bh ) { + printk("print_block: buffer is NULL\n"); + return; + } + + mode = va_arg (args, int); + first = va_arg (args, int); + last = va_arg (args, int); + if (print_leaf (bh, mode, first, last)) + if (print_internal (bh, first, last)) + if (print_super_block (bh)) + if (print_desc_block (bh)) + printk ("Block %ld contains unformatted data\n", bh->b_blocknr); +} + + + +char print_tb_buf[2048]; + +/* this stores initial state of tree balance in the print_tb_buf */ +void store_print_tb (struct tree_balance * tb) +{ + int h = 0; + int i; + struct buffer_head * tbSh, * tbFh; + + if (!tb) + return; + + sprintf (print_tb_buf, "\n" + "BALANCING %d\n" + "MODE=%c, ITEM_POS=%d POS_IN_ITEM=%d\n" + "=====================================================================\n" + "* h * S * L * R * F * FL * FR * CFL * CFR *\n", + tb->tb_sb->u.reiserfs_sb.s_do_balance, + tb->tb_mode, PATH_LAST_POSITION (tb->tb_path), tb->tb_path->pos_in_item); + + for (h = 0; h < sizeof(tb->insert_size) / sizeof (tb->insert_size[0]); h ++) { + if (PATH_H_PATH_OFFSET (tb->tb_path, h) <= tb->tb_path->path_length && + PATH_H_PATH_OFFSET (tb->tb_path, h) > ILLEGAL_PATH_ELEMENT_OFFSET) { + tbSh = PATH_H_PBUFFER (tb->tb_path, h); + tbFh = PATH_H_PPARENT (tb->tb_path, h); + } else { + tbSh = 0; + tbFh = 0; + } + sprintf (print_tb_buf + strlen (print_tb_buf), + "* %d * %3ld(%2d) * %3ld(%2d) * %3ld(%2d) * %5ld * %5ld * %5ld * %5ld * %5ld *\n", + h, + (tbSh) ? (tbSh->b_blocknr):(-1), + (tbSh) ? atomic_read (&(tbSh->b_count)) : -1, + (tb->L[h]) ? (tb->L[h]->b_blocknr):(-1), + (tb->L[h]) ? atomic_read (&(tb->L[h]->b_count)) : -1, + (tb->R[h]) ? (tb->R[h]->b_blocknr):(-1), + (tb->R[h]) ? atomic_read (&(tb->R[h]->b_count)) : -1, + (tbFh) ? (tbFh->b_blocknr):(-1), + (tb->FL[h]) ? (tb->FL[h]->b_blocknr):(-1), + (tb->FR[h]) ? (tb->FR[h]->b_blocknr):(-1), + (tb->CFL[h]) ? (tb->CFL[h]->b_blocknr):(-1), + (tb->CFR[h]) ? 
(tb->CFR[h]->b_blocknr):(-1)); + } + + sprintf (print_tb_buf + strlen (print_tb_buf), + "=====================================================================\n" + "* h * size * ln * lb * rn * rb * blkn * s0 * s1 * s1b * s2 * s2b * curb * lk * rk *\n" + "* 0 * %4d * %2d * %2d * %2d * %2d * %4d * %2d * %2d * %3d * %2d * %3d * %4d * %2d * %2d *\n", + tb->insert_size[0], tb->lnum[0], tb->lbytes, tb->rnum[0],tb->rbytes, tb->blknum[0], + tb->s0num, tb->s1num,tb->s1bytes, tb->s2num, tb->s2bytes, tb->cur_blknum, tb->lkey[0], tb->rkey[0]); + + /* this prints balance parameters for non-leaf levels */ + h = 0; + do { + h++; + sprintf (print_tb_buf + strlen (print_tb_buf), + "* %d * %4d * %2d * * %2d * * %2d *\n", + h, tb->insert_size[h], tb->lnum[h], tb->rnum[h], tb->blknum[h]); + } while (tb->insert_size[h]); + + sprintf (print_tb_buf + strlen (print_tb_buf), + "=====================================================================\n" + "FEB list: "); + + /* print FEB list (list of buffers in form (bh (b_blocknr, b_count), that will be used for new nodes) */ + h = 0; + for (i = 0; i < sizeof (tb->FEB) / sizeof (tb->FEB[0]); i ++) + sprintf (print_tb_buf + strlen (print_tb_buf), + "%p (%lu %d)%s", tb->FEB[i], tb->FEB[i] ? tb->FEB[i]->b_blocknr : 0, + tb->FEB[i] ? atomic_read (&(tb->FEB[i]->b_count)) : 0, + (i == sizeof (tb->FEB) / sizeof (tb->FEB[0]) - 1) ? "\n" : ", "); + + sprintf (print_tb_buf + strlen (print_tb_buf), + "======================== the end ====================================\n"); +} + +void print_cur_tb (char * mes) +{ + printk ("%s\n%s", mes, print_tb_buf); +} + + +#ifndef __KERNEL__ + +void print_bmap_block (int i, char * data, int size, int silent) +{ + int j, k; + int bits = size * 8; + int zeros = 0, ones = 0; + + + if (test_bit (0, data)) { + /* first block addressed by this bitmap block is used */ + ones ++; + if (!silent) + printf ("Busy (%d-", i * bits); + for (j = 1; j < bits; j ++) { + while (test_bit (j, data)) { + ones ++; + if (j == bits - 1) { + if (!silent) + printf ("%d)\n", j + i * bits); + goto end; + } + j++; + } + if (!silent) + printf ("%d) Free(%d-", j - 1 + i * bits, j + i * bits); + + while (!test_bit (j, data)) { + zeros ++; + if (j == bits - 1) { + if (!silent) + printf ("%d)\n", j + i * bits); + goto end; + } + j++; + } + if (!silent) + printf ("%d) Busy(%d-", j - 1 + i * bits, j + i * bits); + + j --; + end: + } + } else { + /* first block addressed by this bitmap is free */ + zeros ++; + if (!silent) + printf ("Free (%d-", i * bits); + for (j = 1; j < bits; j ++) { + k = 0; + while (!test_bit (j, data)) { + k ++; + if (j == bits - 1) { + if (!silent) + printf ("%d)\n", j + i * bits); + zeros += k; + goto end2; + } + j++; + } + zeros += k; + if (!silent) + printf ("%d) Busy(%d-", j - 1 + i * bits, j + i * bits); + + k = 0; + while (test_bit (j, data)) { + ones ++; + if (j == bits - 1) { + if (!silent) + printf ("%d)\n", j + i * bits); + ones += k; + goto end2; + } + j++; + } + ones += k; + if (!silent) + printf ("%d) Busy(%d-", j - 1 + i * bits, j + i * bits); + + j --; + end2: + } + } + + printf ("used %d, free %d\n", ones, zeros); +} + + +/* if silent == 1, do not print details */ +void print_bmap (struct super_block * s, int silent) +{ + int bmapnr = SB_BMAP_NR (s); + int i; + + printf ("Bitmap blocks are:\n"); + for (i = 0; i < bmapnr; i ++) { + printf ("#%d: block %lu: ", i, SB_AP_BITMAP(s)[i]->b_blocknr); + print_bmap_block (i, SB_AP_BITMAP(s)[i]->b_data, s->s_blocksize, silent); + } + +} + + + + +void print_objectid_map (struct super_block * s) +{ 
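+	/* Descriptive note on the map this function dumps: the objectid map
+	** sits in the super block buffer immediately after
+	** struct reiserfs_super_block (omap = (unsigned long *)(rs + 1)).
+	** Its rs->s_oid_cursize entries (capacity rs->s_oid_maxsize) are read
+	** as alternating range boundaries: an even index starts a busy
+	** objectid range running up to the next entry minus one, an odd index
+	** starts a free range.  Like the rest of this #ifndef __KERNEL__
+	** section, the function is only built for the user-space debug tools.
+	*/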
+ int i; + struct reiserfs_super_block * rs; + unsigned long * omap; + + rs = SB_DISK_SUPER_BLOCK (s); + omap = (unsigned long *)(rs + 1); + printk ("Map of objectids\n"); + + for (i = 0; i < rs->s_oid_cursize; i ++) { + if (i % 2 == 0) + printk ("busy(%lu-%lu) ", omap[i], omap[i+1] - 1); + else + printk ("free(%lu-%lu) ", + omap[i], ((i+1) == rs->s_oid_cursize) ? -1 : omap[i+1] - 1); + } + printk ("\n"); + + printk ("Object id array has size %d (max %d):", rs->s_oid_cursize, + rs->s_oid_maxsize); + + for (i = 0; i < rs->s_oid_cursize; i ++) + printk ("%lu ", omap[i]); + printk ("\n"); + +} + +#endif /* #ifndef __KERNEL__ */ + + +static void check_leaf_block_head (struct buffer_head * bh) +{ + struct block_head * blkh; + + blkh = B_BLK_HEAD (bh); + if (le16_to_cpu (blkh->blk_nr_item) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic (0, "vs-6010: check_leaf_block_head: invalid item number %z", bh); + if (le16_to_cpu (blkh->blk_free_space) > + bh->b_size - BLKH_SIZE - IH_SIZE * le16_to_cpu (blkh->blk_nr_item)) + reiserfs_panic (0, "vs-6020: check_leaf_block_head: invalid free space %z", bh); + +} + +static void check_internal_block_head (struct buffer_head * bh) +{ + struct block_head * blkh; + + blkh = B_BLK_HEAD (bh); + if (!(B_LEVEL (bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL (bh) <= MAX_HEIGHT)) + reiserfs_panic (0, "vs-6025: check_internal_block_head: invalid level %z", bh); + + if (B_NR_ITEMS (bh) > (bh->b_size - BLKH_SIZE) / IH_SIZE) + reiserfs_panic (0, "vs-6030: check_internal_block_head: invalid item number %z", bh); + + if (B_FREE_SPACE (bh) != + bh->b_size - BLKH_SIZE - KEY_SIZE * B_NR_ITEMS (bh) - DC_SIZE * (B_NR_ITEMS (bh) + 1)) + reiserfs_panic (0, "vs-6040: check_internal_block_head: invalid free space %z", bh); + +} + + +void check_leaf (struct buffer_head * bh) +{ + int i; + struct item_head * ih; + + if (!bh) + return; + check_leaf_block_head (bh); + for (i = 0, ih = B_N_PITEM_HEAD (bh, 0); i < B_NR_ITEMS (bh); i ++, ih ++) + op_check_item (ih, B_I_PITEM (bh, ih)); +} + + +void check_internal (struct buffer_head * bh) +{ + if (!bh) + return; + check_internal_block_head (bh); +} + + +void print_statistics (struct super_block * s) +{ + + /* + printk ("reiserfs_put_super: session statistics: balances %d, fix_nodes %d, preserve list freeings %d, \ +bmap with search %d, without %d, dir2ind %d, ind2dir %d\n", + s->u.reiserfs_sb.s_do_balance, s->u.reiserfs_sb.s_fix_nodes, s->u.reiserfs_sb.s_preserve_list_freeings, + s->u.reiserfs_sb.s_bmaps, s->u.reiserfs_sb.s_bmaps_without_search, + s->u.reiserfs_sb.s_direct2indirect, s->u.reiserfs_sb.s_indirect2direct); + */ + +} diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/resize.c linux/fs/reiserfs/resize.c --- v2.4.0/linux/fs/reiserfs/resize.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/resize.c Mon Jan 15 12:42:32 2001 @@ -0,0 +1,168 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Alexander Zarochentcev. + * + * The kernel part of the (on-line) reiserfs resizer. 
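+ *
+ * reiserfs_resize() below only grows a mounted filesystem; on-line
+ * shrinking is refused.  It checks that the new last block is readable,
+ * requires the current disk layout (distributed bitmaps, fs version
+ * >= 3.5.12), reallocates the journal list bitmaps and the array of
+ * bitmap buffers when the number of bitmap blocks grows, and then, in
+ * a single transaction, fixes up the last bitmap block of the old and
+ * new layouts and updates the block count, free block count and bitmap
+ * count in the super block.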
+ */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + +int reiserfs_resize (struct super_block * s, unsigned long block_count_new) +{ + struct reiserfs_super_block * sb; + struct buffer_head ** bitmap, * bh; + struct reiserfs_transaction_handle th; + unsigned int bmap_nr_new, bmap_nr; + unsigned int block_r_new, block_r; + + struct reiserfs_list_bitmap * jb; + struct reiserfs_list_bitmap jbitmap[JOURNAL_NUM_BITMAPS]; + + unsigned long int block_count, free_blocks; + int i; + int copy_size ; + + sb = SB_DISK_SUPER_BLOCK(s); + + if (SB_BLOCK_COUNT(s) >= block_count_new) { + printk("can\'t shrink filesystem on-line\n"); + return -EINVAL; + } + + /* check the device size */ + bh = bread(s->s_dev, block_count_new - 1, s->s_blocksize); + if (!bh) { + printk("reiserfs_resize: can\'t read last block\n"); + return -EINVAL; + } + bforget(bh); + + /* old disk layout detection; those partitions can be mounted, but + * cannot be resized */ + if (SB_BUFFER_WITH_SB(s)->b_blocknr * SB_BUFFER_WITH_SB(s)->b_size + != REISERFS_DISK_OFFSET_IN_BYTES ) { + printk("reiserfs_resize: unable to resize a reiserfs without distributed bitmap (fs version < 3.5.12)\n"); + return -ENOTSUPP; + } + + /* count used bits in last bitmap block */ + block_r = SB_BLOCK_COUNT(s) - + (SB_BMAP_NR(s) - 1) * s->s_blocksize * 8; + + /* count bitmap blocks in new fs */ + bmap_nr_new = block_count_new / ( s->s_blocksize * 8 ); + block_r_new = block_count_new - bmap_nr_new * s->s_blocksize * 8; + if (block_r_new) + bmap_nr_new++; + else + block_r_new = s->s_blocksize * 8; + + /* save old values */ + block_count = SB_BLOCK_COUNT(s); + bmap_nr = SB_BMAP_NR(s); + + /* resizing of reiserfs bitmaps (journal and real), if needed */ + if (bmap_nr_new > bmap_nr) { + /* reallocate journal bitmaps */ + if (reiserfs_allocate_list_bitmaps(s, jbitmap, bmap_nr_new) < 0) { + printk("reiserfs_resize: unable to allocate memory for journal bitmaps\n"); + unlock_super(s) ; + return -ENOMEM ; + } + /* the new journal bitmaps are zero filled, now we copy in the bitmap + ** node pointers from the old journal bitmap structs, and then + ** transfer the new data structures into the journal struct. + ** + ** using the copy_size var below allows this code to work for + ** both shrinking and expanding the FS. + */ + copy_size = bmap_nr_new < bmap_nr ? 
bmap_nr_new : bmap_nr ; + copy_size = copy_size * sizeof(struct reiserfs_list_bitmap_node *) ; + for (i = 0 ; i < JOURNAL_NUM_BITMAPS ; i++) { + struct reiserfs_bitmap_node **node_tmp ; + jb = SB_JOURNAL(s)->j_list_bitmap + i ; + memcpy(jbitmap[i].bitmaps, jb->bitmaps, copy_size) ; + + /* just in case vfree schedules on us, copy the new + ** pointer into the journal struct before freeing the + ** old one + */ + node_tmp = jb->bitmaps ; + jb->bitmaps = jbitmap[i].bitmaps ; + vfree(node_tmp) ; + } + + /* allocate additional bitmap blocks, reallocate array of bitmap + * block pointers */ + bitmap = reiserfs_kmalloc(sizeof(struct buffer_head *) * bmap_nr_new, GFP_KERNEL, s); + if (!bitmap) { + printk("reiserfs_resize: unable to allocate memory.\n"); + return -ENOMEM; + } + for (i = 0; i < bmap_nr; i++) + bitmap[i] = SB_AP_BITMAP(s)[i]; + for (i = bmap_nr; i < bmap_nr_new; i++) { + bitmap[i] = reiserfs_getblk(s->s_dev, i * s->s_blocksize * 8, s->s_blocksize); + memset(bitmap[i]->b_data, 0, sb->s_blocksize); + reiserfs_test_and_set_le_bit(0, bitmap[i]->b_data); + + mark_buffer_dirty(bitmap[i]) ; + mark_buffer_uptodate(bitmap[i], 1); + ll_rw_block(WRITE, 1, bitmap + i); + wait_on_buffer(bitmap[i]); + } + /* free old bitmap blocks array */ + reiserfs_kfree(SB_AP_BITMAP(s), + sizeof(struct buffer_head *) * bmap_nr, s); + SB_AP_BITMAP(s) = bitmap; + } + + /* begin transaction */ + journal_begin(&th, s, 10); + + /* correct last bitmap blocks in old and new disk layout */ + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr - 1], 1); + for (i = block_r; i < s->s_blocksize * 8; i++) + reiserfs_test_and_clear_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr - 1]->b_data); + journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr - 1]); + + reiserfs_prepare_for_journal(s, SB_AP_BITMAP(s)[bmap_nr_new - 1], 1); + for (i = block_r_new; i < s->s_blocksize * 8; i++) + reiserfs_test_and_set_le_bit(i, + SB_AP_BITMAP(s)[bmap_nr_new - 1]->b_data); + journal_mark_dirty(&th, s, SB_AP_BITMAP(s)[bmap_nr_new - 1]); + + /* update super */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + free_blocks = SB_FREE_BLOCKS(s); + PUT_SB_FREE_BLOCKS(s, free_blocks + (block_count_new - block_count - (bmap_nr_new - bmap_nr))); + PUT_SB_BLOCK_COUNT(s, block_count_new); + PUT_SB_BMAP_NR(s, bmap_nr_new); + s->s_dirt = 1; + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s)); + + SB_JOURNAL(s)->j_must_wait = 1; + journal_end(&th, s, 10); + + return 0; +} + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/stree.c linux/fs/reiserfs/stree.c --- v2.4.0/linux/fs/reiserfs/stree.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/stree.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,2078 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +/* + * Written by Anatoly P. 
Pinchuk pap@namesys.botik.ru + * Programm System Institute + * Pereslavl-Zalessky Russia + */ + +/* + * This file contains functions dealing with S+tree + * + * B_IS_IN_TREE + * copy_short_key + * copy_item_head + * comp_short_keys + * comp_keys + * comp_cpu_keys + * comp_short_le_keys + * comp_short_cpu_keys + * cpu_key2cpu_key + * le_key2cpu_key + * comp_le_keys + * bin_search + * get_lkey + * get_rkey + * key_in_buffer + * decrement_bcount + * decrement_counters_in_path + * reiserfs_check_path + * pathrelse_and_restore + * pathrelse + * search_by_key_reada + * search_by_key + * search_for_position_by_key + * comp_items + * prepare_for_direct_item + * prepare_for_direntry_item + * prepare_for_delete_or_cut + * calc_deleted_bytes_number + * init_tb_struct + * padd_item + * reiserfs_delete_item + * reiserfs_delete_solid_item + * reiserfs_delete_object + * maybe_indirect_to_direct + * indirect_to_direct_roll_back + * reiserfs_cut_from_item + * truncate_directory + * reiserfs_do_truncate + * reiserfs_paste_into_item + * reiserfs_insert_item + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + + +/* Does the buffer contain a disk block which is in the tree. */ +inline int B_IS_IN_TREE (struct buffer_head * p_s_bh) +{ + +#ifdef CONFIG_REISERFS_CHECK + + if ( B_LEVEL (p_s_bh) > MAX_HEIGHT ) { + reiserfs_panic(0, "PAP-1010: B_IS_IN_TREE: block (%b) has too big level (%z)", + p_s_bh, p_s_bh); + } +#endif + + return ( B_LEVEL (p_s_bh) != FREE_LEVEL ); +} + + + + +inline void copy_short_key (void * to, void * from) +{ + memcpy (to, from, SHORT_KEY_SIZE); +} + +// +// to gets item head in le form +// +inline void copy_item_head(void * p_v_to, void * p_v_from) +{ + memcpy (p_v_to, p_v_from, IH_SIZE); +} + + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. For key of items of the same + object this returns 0. + Returns: -1 if key1 < key2 + 0 if key1 == key2 + 1 if key1 > key2 */ +inline int comp_short_keys (struct key * le_key, struct cpu_key * cpu_key) +{ + __u32 * p_s_le_u32, * p_s_cpu_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_le_u32 = (__u32 *)le_key; + p_s_cpu_u32 = (__u32 *)cpu_key; + for( ; n_key_length--; ++p_s_le_u32, ++p_s_cpu_u32 ) { + if ( le32_to_cpu (*p_s_le_u32) < *p_s_cpu_u32 ) + return -1; + if ( le32_to_cpu (*p_s_le_u32) > *p_s_cpu_u32 ) + return 1; + } + + return 0; +} + + +/* k1 is pointer to on-disk structure which is stored in little-endian + form. k2 is pointer to cpu variable. + Compare keys using all 4 key fields. 
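+   (comp_short_keys compares dir_id and objectid first; offset and then
+   type follow.  A cpu key with key_length == 3 skips the type
+   comparison -- type only matters while tail conversion is in
+   progress.)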
+ Returns: -1 if key1 < key2 0 + if key1 = key2 1 if key1 > key2 */ +inline int comp_keys (struct key * le_key, struct cpu_key * cpu_key) +{ + int retval; + + retval = comp_short_keys (le_key, cpu_key); + if (retval) + return retval; + if (le_key_k_offset (cpu_key->version, le_key) < cpu_key_k_offset (cpu_key)) + return -1; + if (le_key_k_offset (cpu_key->version, le_key) > cpu_key_k_offset (cpu_key)) + return 1; + + if (cpu_key->key_length == 3) + return 0; + + /* this part is needed only when tail conversion is in progress */ + if (le_key_k_type (cpu_key->version, le_key) < cpu_key_k_type (cpu_key)) + return -1; + + if (le_key_k_type (cpu_key->version, le_key) > cpu_key_k_type (cpu_key)) + return 1; + + return 0; +} + + +// +// FIXME: not used yet +// +inline int comp_cpu_keys (struct cpu_key * key1, struct cpu_key * key2) +{ + if (key1->on_disk_key.k_dir_id < key2->on_disk_key.k_dir_id) + return -1; + if (key1->on_disk_key.k_dir_id > key2->on_disk_key.k_dir_id) + return 1; + + if (key1->on_disk_key.k_objectid < key2->on_disk_key.k_objectid) + return -1; + if (key1->on_disk_key.k_objectid > key2->on_disk_key.k_objectid) + return 1; + + if (cpu_key_k_offset (key1) < cpu_key_k_offset (key2)) + return -1; + if (cpu_key_k_offset (key1) > cpu_key_k_offset (key2)) + return 1; + + reiserfs_warning ("comp_cpu_keys: type are compared for %k and %k\n", + key1, key2); + + if (cpu_key_k_type (key1) < cpu_key_k_type (key2)) + return -1; + if (cpu_key_k_type (key1) > cpu_key_k_type (key2)) + return 1; + return 0; +} + +inline int comp_short_le_keys (struct key * key1, struct key * key2) +{ + __u32 * p_s_1_u32, * p_s_2_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_1_u32 = (__u32 *)key1; + p_s_2_u32 = (__u32 *)key2; + for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) { + if ( le32_to_cpu (*p_s_1_u32) < le32_to_cpu (*p_s_2_u32) ) + return -1; + if ( le32_to_cpu (*p_s_1_u32) > le32_to_cpu (*p_s_2_u32) ) + return 1; + } + return 0; +} + +inline int comp_short_cpu_keys (struct cpu_key * key1, + struct cpu_key * key2) +{ + __u32 * p_s_1_u32, * p_s_2_u32; + int n_key_length = REISERFS_SHORT_KEY_LEN; + + p_s_1_u32 = (__u32 *)key1; + p_s_2_u32 = (__u32 *)key2; + + for( ; n_key_length--; ++p_s_1_u32, ++p_s_2_u32 ) { + if ( *p_s_1_u32 < *p_s_2_u32 ) + return -1; + if ( *p_s_1_u32 > *p_s_2_u32 ) + return 1; + } + return 0; +} + + + +inline void cpu_key2cpu_key (struct cpu_key * to, struct cpu_key * from) +{ + memcpy (to, from, sizeof (struct cpu_key)); +} + + +inline void le_key2cpu_key (struct cpu_key * to, struct key * from) +{ + to->on_disk_key.k_dir_id = le32_to_cpu (from->k_dir_id); + to->on_disk_key.k_objectid = le32_to_cpu (from->k_objectid); + + // find out version of the key + to->version = le_key_version (from); + if (to->version == ITEM_VERSION_1) { + to->on_disk_key.u.k_offset_v1.k_offset = le32_to_cpu (from->u.k_offset_v1.k_offset); + to->on_disk_key.u.k_offset_v1.k_uniqueness = le32_to_cpu (from->u.k_offset_v1.k_uniqueness); + } else { + to->on_disk_key.u.k_offset_v2.k_offset = le64_to_cpu (from->u.k_offset_v2.k_offset); + to->on_disk_key.u.k_offset_v2.k_type = le16_to_cpu (from->u.k_offset_v2.k_type); + } +} + + + +// this does not say which one is bigger, it only returns 1 if keys +// are not equal, 0 otherwise +inline int comp_le_keys (struct key * k1, struct key * k2) +{ + return memcmp (k1, k2, sizeof (struct key)); +} + +/************************************************************************** + * Binary search toolkit function * + * Search for an item in the array by the item key 
* + * Returns: 1 if found, 0 if not found; * + * *p_n_pos = number of the searched element if found, else the * + * number of the first element that is larger than p_v_key. * + **************************************************************************/ +/* For those not familiar with binary search: n_lbound is the leftmost item that it + could be, n_rbound the rightmost item that it could be. We examine the item + halfway between n_lbound and n_rbound, and that tells us either that we can increase + n_lbound, or decrease n_rbound, or that we have found it, or if n_lbound <= n_rbound that + there are no possible items, and we have not found it. With each examination we + cut the number of possible items it could be by one more than half rounded down, + or we find it. */ +inline int bin_search ( + void * p_v_key, /* Key to search for. */ + void * p_v_base, /* First item in the array. */ + int p_n_num, /* Number of items in the array. */ + int p_n_width, /* Item size in the array. + searched. Lest the reader be + confused, note that this is crafted + as a general function, and when it + is applied specifically to the array + of item headers in a node, p_n_width + is actually the item header size not + the item size. */ + int * p_n_pos /* Number of the searched for element. */ + ) { + int n_rbound, n_lbound, n_j; + + for ( n_j = ((n_rbound = p_n_num - 1) + (n_lbound = 0))/2; n_lbound <= n_rbound; n_j = (n_rbound + n_lbound)/2 ) + switch( COMP_KEYS((struct key *)((char * )p_v_base + n_j * p_n_width), (struct cpu_key *)p_v_key) ) { + case -1: n_lbound = n_j + 1; continue; + case 1: n_rbound = n_j - 1; continue; + case 0: *p_n_pos = n_j; return ITEM_FOUND; /* Key found in the array. */ + } + + /* bin_search did not find given key, it returns position of key, + that is minimal and greater than the given one. */ + *p_n_pos = n_lbound; + return ITEM_NOT_FOUND; +} + +#ifdef CONFIG_REISERFS_CHECK +extern struct tree_balance * cur_tb; +#endif + + + +/* Minimal possible key. It is never in the tree. */ +struct key MIN_KEY = {0, 0, {{0, 0},}}; + +/* Maximal possible key. It is never in the tree. */ +struct key MAX_KEY = {0xffffffff, 0xffffffff, {{0xffffffff, 0xffffffff},}}; + + +/* Get delimiting key of the buffer by looking for it in the buffers in the path, starting from the bottom + of the path, and going upwards. We must check the path's validity at each step. If the key is not in + the path, there is no delimiting key in the tree (buffer is first or last buffer in tree), and in this + case we return a special key, either MIN_KEY or MAX_KEY. */ +inline struct key * get_lkey ( + struct path * p_s_chk_path, + struct super_block * p_s_sb + ) { + int n_position, n_path_offset = p_s_chk_path->path_length; + struct buffer_head * p_s_parent; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < FIRST_PATH_ELEMENT_OFFSET ) + reiserfs_panic(p_s_sb,"PAP-5010: get_lkey: illegal offset in the path"); +#endif + + /* While not higher in path than first element. */ + while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + reiserfs_panic(p_s_sb, "PAP-5020: get_lkey: parent is not uptodate"); +#endif + + /* Parent at the path is not in the tree now. */ + if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + return &MAX_KEY; + /* Check whether position in the parent is correct. 
*/ + if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) + return &MAX_KEY; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) + return &MAX_KEY; + /* Return delimiting key if position in the parent is not equal to zero. */ + if ( n_position ) + return B_N_PDELIM_KEY(p_s_parent, n_position - 1); + } + /* Return MIN_KEY if we are in the root of the buffer tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_sb) ) + return &MIN_KEY; + return &MAX_KEY; +} + + +/* Get delimiting key of the buffer at the path and its right neighbor. */ +inline struct key * get_rkey ( + struct path * p_s_chk_path, + struct super_block * p_s_sb + ) { + int n_position, + n_path_offset = p_s_chk_path->path_length; + struct buffer_head * p_s_parent; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < FIRST_PATH_ELEMENT_OFFSET ) + reiserfs_panic(p_s_sb,"PAP-5030: get_rkey: illegal offset in the path"); +#endif + + while ( n_path_offset-- > FIRST_PATH_ELEMENT_OFFSET ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! buffer_uptodate(PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + reiserfs_panic(p_s_sb, "PAP-5040: get_rkey: parent is not uptodate"); +#endif + + /* Parent at the path is not in the tree now. */ + if ( ! B_IS_IN_TREE(p_s_parent = PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset)) ) + return &MIN_KEY; + /* Check whether position in the parrent is correct. */ + if ( (n_position = PATH_OFFSET_POSITION(p_s_chk_path, n_path_offset)) > B_NR_ITEMS(p_s_parent) ) + return &MIN_KEY; + /* Check whether parent at the path really points to the child. */ + if ( B_N_CHILD_NUM(p_s_parent, n_position) != + PATH_OFFSET_PBUFFER(p_s_chk_path, n_path_offset + 1)->b_blocknr ) + return &MIN_KEY; + /* Return delimiting key if position in the parent is not the last one. */ + if ( n_position != B_NR_ITEMS(p_s_parent) ) + return B_N_PDELIM_KEY(p_s_parent, n_position); + } + /* Return MAX_KEY if we are in the root of the buffer tree. */ + if ( PATH_OFFSET_PBUFFER(p_s_chk_path, FIRST_PATH_ELEMENT_OFFSET)->b_blocknr == + SB_ROOT_BLOCK (p_s_sb) ) + return &MAX_KEY; + return &MIN_KEY; +} + + +/* Check whether a key is contained in the tree rooted from a buffer at a path. */ +/* This works by looking at the left and right delimiting keys for the buffer in the last path_element in + the path. These delimiting keys are stored at least one level above that buffer in the tree. If the + buffer is the first or last node in the tree order then one of the delimiting keys may be absent, and in + this case get_lkey and get_rkey return a special key which is MIN_KEY or MAX_KEY. */ +static inline int key_in_buffer ( + struct path * p_s_chk_path, /* Path which should be checked. */ + struct cpu_key * p_s_key, /* Key which should be checked. */ + struct super_block * p_s_sb /* Super block pointer. */ + ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! 
p_s_key || p_s_chk_path->path_length < FIRST_PATH_ELEMENT_OFFSET || + p_s_chk_path->path_length > MAX_HEIGHT ) + reiserfs_panic(p_s_sb, "PAP-5050: key_in_buffer: pointer to the key(%p) is NULL or illegal path length(%d)", + p_s_key, p_s_chk_path->path_length); + + if ( PATH_PLAST_BUFFER(p_s_chk_path)->b_dev == NODEV ) + reiserfs_panic(p_s_sb, "PAP-5060: key_in_buffer: device must not be NODEV"); +#endif + + if ( COMP_KEYS(get_lkey(p_s_chk_path, p_s_sb), p_s_key) == 1 ) + /* left delimiting key is bigger, that the key we look for */ + return 0; + // if ( COMP_KEYS(p_s_key, get_rkey(p_s_chk_path, p_s_sb)) != -1 ) + if ( COMP_KEYS(get_rkey(p_s_chk_path, p_s_sb), p_s_key) != 1 ) + /* p_s_key must be less than right delimitiing key */ + return 0; + return 1; +} + + +inline void decrement_bcount( + struct buffer_head * p_s_bh + ) { + if ( p_s_bh ) { + if ( atomic_read (&(p_s_bh->b_count)) ) { + atomic_dec (&(p_s_bh->b_count)); + return; + } + reiserfs_panic(NULL, "PAP-5070: decrement_bcount: trying to free free buffer %b", p_s_bh); + } +} + + +/* Decrement b_count field of the all buffers in the path. */ +void decrement_counters_in_path ( + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET || + n_path_offset > EXTENDED_MAX_HEIGHT - 1 ) + reiserfs_panic(NULL, "PAP-5080: decrement_counters_in_path: illegal path offset of %d", n_path_offset); +#endif + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { + struct buffer_head * bh; + + bh = PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--); + decrement_bcount (bh); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + + +int reiserfs_check_path(struct path *p) { +#ifdef CONFIG_REISERFS_CHECK + if (p->path_length != ILLEGAL_PATH_ELEMENT_OFFSET) { + reiserfs_warning("check_path, path not properly relsed\n") ; + BUG() ; + } +#endif + return 0 ; +} + + +/* Release all buffers in the path. Restore dirty bits clean +** when preparing the buffer for the log +** +** only called from fix_nodes() +*/ +void pathrelse_and_restore ( + struct super_block *s, + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ) + reiserfs_panic(NULL, "clm-4000: pathrelse: illegal path offset"); +#endif + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) { + reiserfs_restore_prepared_buffer(s, PATH_OFFSET_PBUFFER(p_s_search_path, + n_path_offset)); + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + } + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + +/* Release all buffers in the path. 
*/ +void pathrelse ( + struct path * p_s_search_path + ) { + int n_path_offset = p_s_search_path->path_length; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_path_offset < ILLEGAL_PATH_ELEMENT_OFFSET ) + reiserfs_panic(NULL, "PAP-5090: pathrelse: illegal path offset"); +#endif + + while ( n_path_offset > ILLEGAL_PATH_ELEMENT_OFFSET ) + brelse(PATH_OFFSET_PBUFFER(p_s_search_path, n_path_offset--)); + + p_s_search_path->path_length = ILLEGAL_PATH_ELEMENT_OFFSET; +} + + + +static int is_leaf (char * buf, int blocksize, struct buffer_head * bh) +{ + struct block_head * blkh; + struct item_head * ih; + int used_space; + int prev_location; + int i; + int nr; + + blkh = (struct block_head *)buf; + if (le16_to_cpu (blkh->blk_level) != DISK_LEAF_NODE_LEVEL) { + printk ("is_leaf: this should be caught earlier\n"); + return 0; + } + + nr = le16_to_cpu (blkh->blk_nr_item); + if (nr < 1 || nr > ((blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN))) { + /* item number is too big or too small */ + reiserfs_warning ("is_leaf: nr_item seems wrong: %z\n", bh); + return 0; + } + ih = (struct item_head *)(buf + BLKH_SIZE) + nr - 1; + used_space = BLKH_SIZE + IH_SIZE * nr + (blocksize - ih_location (ih)); + if (used_space != blocksize - le16_to_cpu (blkh->blk_free_space)) { + /* free space does not match to calculated amount of use space */ + reiserfs_warning ("is_leaf: free space seems wrong: %z\n", bh); + return 0; + } + + // FIXME: it is_leaf will hit performance too much - we may have + // return 1 here + + /* check tables of item heads */ + ih = (struct item_head *)(buf + BLKH_SIZE); + prev_location = blocksize; + for (i = 0; i < nr; i ++, ih ++) { + if (ih_location (ih) >= blocksize || ih_location (ih) < IH_SIZE * nr) { + reiserfs_warning ("is_leaf: item location seems wrong: %h\n", ih); + return 0; + } + if (ih_item_len (ih) < 1 || ih_item_len (ih) > MAX_ITEM_LEN (blocksize)) { + reiserfs_warning ("is_leaf: item length seems wrong: %h\n", ih); + return 0; + } + if (prev_location - ih_location (ih) != ih_item_len (ih)) { + reiserfs_warning ("is_leaf: item location seems wrong (second one): %h\n", ih); + return 0; + } + prev_location = ih_location (ih); + } + + // one may imagine much more checks + return 1; +} + + +/* returns 1 if buf looks like an internal node, 0 otherwise */ +static int is_internal (char * buf, int blocksize, struct buffer_head * bh) +{ + struct block_head * blkh; + int nr; + int used_space; + + blkh = (struct block_head *)buf; + if (le16_to_cpu (blkh->blk_level) <= DISK_LEAF_NODE_LEVEL || + le16_to_cpu (blkh->blk_level) > MAX_HEIGHT) { + /* this level is not possible for internal nodes */ + printk ("is_internal: this should be caught earlier\n"); + return 0; + } + + nr = le16_to_cpu (blkh->blk_nr_item); + if (nr > (blocksize - BLKH_SIZE - DC_SIZE) / (KEY_SIZE + DC_SIZE)) { + /* for internal which is not root we might check min number of keys */ + reiserfs_warning ("is_internal: number of key seems wrong: %z\n", bh); + return 0; + } + + used_space = BLKH_SIZE + KEY_SIZE * nr + DC_SIZE * (nr + 1); + if (used_space != blocksize - le16_to_cpu (blkh->blk_free_space)) { + reiserfs_warning ("is_internal: free space seems wrong: %z\n", bh); + return 0; + } + + // one may imagine much more checks + return 1; +} + + +// make sure that bh contains formatted node of reiserfs tree of +// 'level'-th level +static int is_tree_node (struct buffer_head * bh, int level) +{ + if (B_LEVEL (bh) != level) { + printk ("is_tree_node: node level %d does not match to the expected one %d\n", + B_LEVEL (bh), level); + 
return 0; + } + if (level == DISK_LEAF_NODE_LEVEL) + return is_leaf (bh->b_data, bh->b_size, bh); + + return is_internal (bh->b_data, bh->b_size, bh); +} + + + +#ifdef SEARCH_BY_KEY_READA + +/* The function is NOT SCHEDULE-SAFE! */ +static void search_by_key_reada (struct super_block * s, int blocknr) +{ + struct buffer_head * bh; + + if (blocknr == 0) + return; + + bh = reiserfs_getblk (s->s_dev, blocknr, s->s_blocksize); + + if (!buffer_uptodate (bh)) { + ll_rw_block (READA, 1, &bh); + } + bh->b_count --; +} + +#endif + +/************************************************************************** + * Algorithm SearchByKey * + * look for item in the Disk S+Tree by its key * + * Input: p_s_sb - super block * + * p_s_key - pointer to the key to search * + * Output: ITEM_FOUND, ITEM_NOT_FOUND or IO_ERROR * + * p_s_search_path - path from the root to the needed leaf * + **************************************************************************/ + +/* This function fills up the path from the root to the leaf as it + descends the tree looking for the key. It uses reiserfs_bread to + try to find buffers in the cache given their block number. If it + does not find them in the cache it reads them from disk. For each + node search_by_key finds using reiserfs_bread it then uses + bin_search to look through that node. bin_search will find the + position of the block_number of the next node if it is looking + through an internal node. If it is looking through a leaf node + bin_search will find the position of the item which has key either + equal to given key, or which is the maximal key less than the given + key. search_by_key returns a path that must be checked for the + correctness of the top of the path but need not be checked for the + correctness of the bottom of the path */ +/* The function is NOT SCHEDULE-SAFE! */ +int search_by_key (struct super_block * p_s_sb, + struct cpu_key * p_s_key, /* Key to search. */ + struct path * p_s_search_path, /* This structure was + allocated and initialized + by the calling + function. It is filled up + by this function. */ + int n_stop_level /* How far down the tree to search. To + stop at leaf level - set to + DISK_LEAF_NODE_LEVEL */ + ) { + kdev_t n_dev = p_s_sb->s_dev; + int n_block_number = SB_ROOT_BLOCK (p_s_sb), + expected_level = SB_TREE_HEIGHT (p_s_sb), + n_block_size = p_s_sb->s_blocksize; + struct buffer_head * p_s_bh; + struct path_element * p_s_last_element; + int n_node_level, n_retval; + int right_neighbor_of_leaf_node; + int fs_gen; + +#ifdef CONFIG_REISERFS_CHECK + int n_repeat_counter = 0; +#endif + + /* As we add each node to a path we increase its count. This means that + we must be careful to release all nodes in a path before we either + discard the path struct or re-use the path struct, as we do here. */ + + decrement_counters_in_path(p_s_search_path); + + right_neighbor_of_leaf_node = 0; + + /* With each iteration of this loop we search through the items in the + current node, and calculate the next current node(next path element) + for the next iteration of this loop.. */ + while ( 1 ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( !(++n_repeat_counter % 50000) ) + reiserfs_warning ("PAP-5100: search_by_key: %s:" + "there were %d iterations of while loop " + "looking for key %K\n", + current->comm, n_repeat_counter, p_s_key); +#endif + + /* prep path to have another element added to it. 
*/ + p_s_last_element = PATH_OFFSET_PELEMENT(p_s_search_path, ++p_s_search_path->path_length); + fs_gen = get_generation (p_s_sb); + expected_level --; + +#ifdef SEARCH_BY_KEY_READA + /* schedule read of right neighbor */ + search_by_key_reada (p_s_sb, right_neighbor_of_leaf_node); +#endif + + /* Read the next tree node, and set the last element in the path to + have a pointer to it. */ + if ( ! (p_s_bh = p_s_last_element->pe_buffer = + reiserfs_bread(n_dev, n_block_number, n_block_size)) ) { + p_s_search_path->path_length --; + pathrelse(p_s_search_path); + return IO_ERROR; + } + + /* It is possible that schedule occured. We must check whether the key + to search is still in the tree rooted from the current buffer. If + not then repeat search from the root. */ + if ( fs_changed (fs_gen, p_s_sb) && + (!B_IS_IN_TREE (p_s_bh) || !key_in_buffer(p_s_search_path, p_s_key, p_s_sb)) ) { + decrement_counters_in_path(p_s_search_path); + + /* Get the root block number so that we can repeat the search + starting from the root. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); + right_neighbor_of_leaf_node = 0; + + /* repeat search from the root */ + continue; + } + +#ifdef CONFIG_REISERFS_CHECK + + if ( ! key_in_buffer(p_s_search_path, p_s_key, p_s_sb) ) + reiserfs_panic(p_s_sb, "PAP-5130: search_by_key: key is not in the buffer"); + if ( cur_tb ) { + print_cur_tb ("5140"); + reiserfs_panic(p_s_sb, "PAP-5140: search_by_key: schedule occurred in do_balance!"); + } + +#endif + + // make sure, that the node contents look like a node of + // certain level + if (!is_tree_node (p_s_bh, expected_level)) { + reiserfs_warning ("vs-5150: search_by_key: " + "invalid format found in block %d. Fsck?\n", p_s_bh->b_blocknr); + pathrelse (p_s_search_path); + return IO_ERROR; + } + + /* ok, we have acquired next formatted node in the tree */ + n_node_level = B_LEVEL (p_s_bh); + +#ifdef CONFIG_REISERFS_CHECK + + if (n_node_level < n_stop_level) + reiserfs_panic (p_s_sb, "vs-5152: search_by_key: tree level is less than stop level (%d)", + n_node_level, n_stop_level); + +#endif + + n_retval = bin_search (p_s_key, B_N_PITEM_HEAD(p_s_bh, 0), B_NR_ITEMS(p_s_bh), + ( n_node_level == DISK_LEAF_NODE_LEVEL ) ? IH_SIZE : KEY_SIZE, &(p_s_last_element->pe_position)); + if (n_node_level == n_stop_level) { + return n_retval; + } + + /* we are not in the stop level */ + if (n_retval == ITEM_FOUND) + /* item has been found, so we choose the pointer which is to the right of the found one */ + p_s_last_element->pe_position++; + + /* if item was not found we choose the position which is to + the left of the found item. This requires no code, + bin_search did it already.*/ + + /* So we have chosen a position in the current node which is + an internal node. Now we calculate child block number by + position in the node. */ + n_block_number = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position); + +#ifdef SEARCH_BY_KEY_READA + /* if we are going to read leaf node, then calculate its right neighbor if possible */ + if (n_node_level == DISK_LEAF_NODE_LEVEL + 1 && p_s_last_element->pe_position < B_NR_ITEMS (p_s_bh)) + right_neighbor_of_leaf_node = B_N_CHILD_NUM(p_s_bh, p_s_last_element->pe_position + 1); +#endif + } +} + + +/* Form the path to an item and position in this item which contains + file byte defined by p_s_key. 
If there is no such item + corresponding to the key, we point the path to the item with + maximal key less than p_s_key, and *p_n_pos_in_item is set to one + past the last entry/byte in the item. If searching for entry in a + directory item, and it is not found, *p_n_pos_in_item is set to one + entry more than the entry with maximal key which is less than the + sought key. + + Note that if there is no entry in this same node which is one more, + then we point to an imaginary entry. for direct items, the + position is in units of bytes, for indirect items the position is + in units of blocknr entries, for directory items the position is in + units of directory entries. */ + +/* The function is NOT SCHEDULE-SAFE! */ +int search_for_position_by_key (struct super_block * p_s_sb, /* Pointer to the super block. */ + struct cpu_key * p_cpu_key, /* Key to search (cpu variable) */ + struct path * p_s_search_path /* Filled up by this function. */ + ) { + struct item_head * p_le_ih; /* pointer to on-disk structure */ + int n_blk_size; + loff_t item_offset, offset; + struct reiserfs_dir_entry de; + int retval; + + /* If searching for directory entry. */ + if ( is_direntry_cpu_key (p_cpu_key) ) + return search_by_entry_key (p_s_sb, p_cpu_key, p_s_search_path, &de); + + /* If not searching for directory entry. */ + + /* If item is found. */ + retval = search_item (p_s_sb, p_cpu_key, p_s_search_path); + if (retval == IO_ERROR) + return retval; + if ( retval == ITEM_FOUND ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( ! B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), + PATH_LAST_POSITION(p_s_search_path))->ih_item_len ) + reiserfs_panic(p_s_sb, "PAP-5165: search_for_position_by_key: item length equals zero"); +#endif + + pos_in_item(p_s_search_path) = 0; + return POSITION_FOUND; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( ! PATH_LAST_POSITION(p_s_search_path) ) + reiserfs_panic(p_s_sb, "PAP-5170: search_for_position_by_key: position equals zero"); +#endif + + /* Item is not found. Set path to the previous item. */ + p_le_ih = B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_search_path), --PATH_LAST_POSITION(p_s_search_path)); + n_blk_size = p_s_sb->s_blocksize; + + if (comp_short_keys (&(p_le_ih->ih_key), p_cpu_key)) { + return FILE_NOT_FOUND; + } + +#if 0 +/*#ifdef CONFIG_REISERFS_CHECK*/ + + /* we expect to find stat data or item of the same type */ + if ( ! is_statdata_le_ih(p_le_ih) && ((is_indirect_cpu_key(p_cpu_key) && ! is_indirect_le_ih(p_le_ih)) || + (is_direct_cpu_key(p_cpu_key) && ! is_direct_le_ih(p_le_ih))) ) { + print_block (PATH_PLAST_BUFFER(p_s_search_path), PRINT_LEAF_ITEMS, + PATH_LAST_POSITION (p_s_search_path) - 2, + PATH_LAST_POSITION (p_s_search_path) + 2); + reiserfs_panic(p_s_sb, "PAP-5190: search_for_position_by_key: " + "found item %h type does not match to the expected one %k", + p_le_ih, p_cpu_key); + } +/*#endif*/ +#endif + + // FIXME: quite ugly this far + + item_offset = le_ih_k_offset (p_le_ih); + offset = cpu_key_k_offset (p_cpu_key); + + /* Needed byte is contained in the item pointed to by the path.*/ + if (item_offset <= offset && + item_offset + op_bytes_number (p_le_ih, n_blk_size) > offset) { + pos_in_item (p_s_search_path) = offset - item_offset; + if ( is_indirect_le_ih(p_le_ih) ) { + pos_in_item (p_s_search_path) /= n_blk_size; + } + return POSITION_FOUND; + } + + /* Needed byte is not contained in the item pointed to by the + path. Set pos_in_item out of the item. 
*/ + if ( is_indirect_le_ih (p_le_ih) ) + pos_in_item (p_s_search_path) = le16_to_cpu (p_le_ih->ih_item_len) / UNFM_P_SIZE; + else + pos_in_item (p_s_search_path) = le16_to_cpu (p_le_ih->ih_item_len); + + return POSITION_NOT_FOUND; +} + + +/* Compare given item and item pointed to by the path. */ +int comp_items (struct item_head * stored_ih, struct path * p_s_path) +{ + struct buffer_head * p_s_bh; + struct item_head * ih; + + /* Last buffer at the path is not in the tree. */ + if ( ! B_IS_IN_TREE(p_s_bh = PATH_PLAST_BUFFER(p_s_path)) ) + return 1; + + /* Last path position is invalid. */ + if ( PATH_LAST_POSITION(p_s_path) >= B_NR_ITEMS(p_s_bh) ) + return 1; + + /* we need only to know, whether it is the same item */ + ih = get_ih (p_s_path); + return memcmp (stored_ih, ih, IH_SIZE); + +#if 0 + /* Get item at the path. */ + p_s_path_item = PATH_PITEM_HEAD(p_s_path); + /* Compare keys. */ + if ( COMP_KEYS(&(p_s_path_item->ih_key), &(p_cpu_ih->ih_key)) ) + return 1; + + /* Compare other items fields. */ + if ( le16_to_cpu (p_s_path_item->u.ih_entry_count) != p_cpu_ih->u.ih_entry_count || + le16_to_cpu (p_s_path_item->ih_item_len) != p_cpu_ih->ih_item_len || + le16_to_cpu ( p_s_path_item->ih_item_location) != p_cpu_ih->ih_item_location ) + return 1; + + /* Items are equal. */ + return 0; +#endif +} + + +/* unformatted nodes are not logged anymore, ever. This is safe +** now +*/ +#define held_by_others(bh) (atomic_read(&(bh)->b_count) > 1) + +// block can not be forgotten as it is in I/O or held by someone +#define block_in_use(bh) (buffer_locked(bh) || (held_by_others(bh))) + + + +// prepare for delete or cut of direct item +static inline int prepare_for_direct_item (struct path * path, + struct item_head * le_ih, + struct inode * inode, + loff_t new_file_length, + int * cut_size) +{ + loff_t round_len; + + + if ( new_file_length == max_reiserfs_offset (inode) ) { + /* item has to be deleted */ + *cut_size = -(IH_SIZE + le16_to_cpu (le_ih->ih_item_len)); + return M_DELETE; + } + + // new file gets truncated + if (inode_items_version (inode) == ITEM_VERSION_2) { + // + round_len = ROUND_UP (new_file_length); + /* this was n_new_file_length < le_ih ... */ + if ( round_len < le_ih_k_offset (le_ih) ) { + *cut_size = -(IH_SIZE + le16_to_cpu (le_ih->ih_item_len)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + pos_in_item (path) = round_len - (le_ih_k_offset (le_ih) - 1); + *cut_size = -(le16_to_cpu (le_ih->ih_item_len) - pos_in_item(path)); + + return M_CUT; /* Cut from this item. */ + } + + + // old file: items may have any length + + if ( new_file_length < le_ih_k_offset (le_ih) ) { + *cut_size = -(IH_SIZE + le16_to_cpu (le_ih->ih_item_len)); + return M_DELETE; /* Delete this item. */ + } + /* Calculate first position and size for cutting from item. */ + *cut_size = -(le16_to_cpu (le_ih->ih_item_len) - + (pos_in_item (path) = new_file_length + 1 - le_ih_k_offset (le_ih))); + return M_CUT; /* Cut from this item. 
*/ +} + + +static inline int prepare_for_direntry_item (struct path * path, + struct item_head * le_ih, + struct inode * inode, + loff_t new_file_length, + int * cut_size) +{ + if (le_ih_k_offset (le_ih) == DOT_OFFSET && + new_file_length == max_reiserfs_offset (inode)) { +#ifdef CONFIG_REISERFS_CHECK + if (ih_entry_count (le_ih) != 2) + reiserfs_panic(inode->i_sb,"PAP-5220: prepare_for_delete_or_cut: " + "incorrect empty directory item (%h)", le_ih); +#endif + *cut_size = -(IH_SIZE + le16_to_cpu (le_ih->ih_item_len)); + return M_DELETE; /* Delete the directory item containing "." and ".." entry. */ + } + + if ( ih_entry_count (le_ih) == 1 ) { + /* Delete the directory item such as there is one record only + in this item*/ + *cut_size = -(IH_SIZE + le16_to_cpu (le_ih->ih_item_len)); + return M_DELETE; + } + + /* Cut one record from the directory item. */ + *cut_size = -(DEH_SIZE + entry_length (get_bh (path), le_ih, pos_in_item (path))); + return M_CUT; +} + + +/* If the path points to a directory or direct item, calculate mode and the size cut, for balance. + If the path points to an indirect item, remove some number of its unformatted nodes. + In case of file truncate calculate whether this item must be deleted/truncated or last + unformatted node of this item will be converted to a direct item. + This function returns a determination of what balance mode the calling function should employ. */ +static char prepare_for_delete_or_cut( + struct reiserfs_transaction_handle *th, + struct inode * inode, + struct path * p_s_path, + struct cpu_key * p_s_item_key, + int * p_n_removed, /* Number of unformatted nodes which were removed + from end of the file. */ + int * p_n_cut_size, + unsigned long long n_new_file_length /* MAX_KEY_OFFSET in case of delete. */ + ) { + struct super_block * p_s_sb = inode->i_sb; + struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_path); + struct buffer_head * p_s_bh = PATH_PLAST_BUFFER(p_s_path); + +#ifdef CONFIG_REISERFS_CHECK + int n_repeat_counter = 0; +#endif + + /* Stat_data item. */ + if ( is_statdata_le_ih (p_le_ih) ) { + +#ifdef CONFIG_REISERFS_CHECK + if ( n_new_file_length != max_reiserfs_offset (inode) ) + reiserfs_panic(p_s_sb, "PAP-5210: prepare_for_delete_or_cut: mode must be M_DELETE"); +#endif + + *p_n_cut_size = -(IH_SIZE + le16_to_cpu (p_le_ih->ih_item_len)); + return M_DELETE; + } + + + /* Directory item. */ + if ( is_direntry_le_ih (p_le_ih) ) + return prepare_for_direntry_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); + + /* Direct item. */ + if ( is_direct_le_ih (p_le_ih) ) + return prepare_for_direct_item (p_s_path, p_le_ih, inode, n_new_file_length, p_n_cut_size); + + + /* Case of an indirect item. */ + { + int n_unfm_number, /* Number of the item unformatted nodes. */ + n_counter, + n_retry, /* Set to one if there is unformatted node buffer in use. */ + n_blk_size; + __u32 * p_n_unfm_pointer; /* Pointer to the unformatted node number. */ + __u32 tmp; + struct item_head s_ih; /* Item header. */ + char c_mode; /* Returned mode of the balance. */ + struct buffer_head * p_s_un_bh; + int need_research; + + + n_blk_size = p_s_sb->s_blocksize; + + /* Search for the needed object indirect item until there are no unformatted nodes to be removed. */ + do { + need_research = 0; + p_s_bh = PATH_PLAST_BUFFER(p_s_path); + /* Copy indirect item header to a temp variable. */ + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + /* Calculate number of unformatted nodes in this item. 
*/ + n_unfm_number = I_UNFM_NUM(&s_ih); + +#ifdef CONFIG_REISERFS_CHECK + if ( ! is_indirect_le_ih(&s_ih) || ! n_unfm_number || + pos_in_item (p_s_path) + 1 != n_unfm_number ) { + printk("n_unfm_number = %d *p_n_pos_in_item = %d\n",n_unfm_number, pos_in_item (p_s_path)); + reiserfs_panic(p_s_sb, "PAP-5240: prepare_for_delete_or_cut: illegal item %h", &s_ih); + } +#endif + + /* Calculate balance mode and position in the item to remove unformatted nodes. */ + if ( n_new_file_length == max_reiserfs_offset (inode) ) {/* Case of delete. */ + pos_in_item (p_s_path) = 0; + *p_n_cut_size = -(IH_SIZE + le16_to_cpu (s_ih.ih_item_len)); + c_mode = M_DELETE; + } + else { /* Case of truncate. */ + if ( n_new_file_length < le_ih_k_offset (&s_ih) ) { + pos_in_item (p_s_path) = 0; + *p_n_cut_size = -(IH_SIZE + le16_to_cpu (s_ih.ih_item_len)); + c_mode = M_DELETE; /* Delete this item. */ + } + else { + /* indirect item must be truncated starting from *p_n_pos_in_item-th position */ + pos_in_item (p_s_path) = (n_new_file_length + n_blk_size - le_ih_k_offset (&s_ih) ) >> p_s_sb->s_blocksize_bits; + +#ifdef CONFIG_REISERFS_CHECK + if ( pos_in_item (p_s_path) > n_unfm_number ) + reiserfs_panic(p_s_sb, "PAP-5250: prepare_for_delete_or_cut: illegal position in the item"); +#endif + + /* Either convert last unformatted node of indirect item to direct item or increase + its free space. */ + if ( pos_in_item (p_s_path) == n_unfm_number ) { + *p_n_cut_size = 0; /* Nothing to cut. */ + return M_CONVERT; /* Maybe convert last unformatted node to the direct item. */ + } + /* Calculate size to cut. */ + *p_n_cut_size = -(s_ih.ih_item_len - pos_in_item (p_s_path) * UNFM_P_SIZE); + + c_mode = M_CUT; /* Cut from this indirect item. */ + } + } + +#ifdef CONFIG_REISERFS_CHECK + if ( n_unfm_number <= pos_in_item (p_s_path) ) + reiserfs_panic(p_s_sb, "PAP-5260: prepare_for_delete_or_cut: illegal position in the indirect item"); +#endif + + /* pointers to be cut */ + n_unfm_number -= pos_in_item (p_s_path); + /* Set pointer to the last unformatted node pointer that is to be cut. */ + p_n_unfm_pointer = (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1 - *p_n_removed; + + + /* We go through the unformatted nodes pointers of the indirect + item and look for the unformatted nodes in the cache. If we + found some of them we free it, zero corresponding indirect item + entry and log buffer containing that indirect item. For this we + need to prepare last path element for logging. If some + unformatted node has b_count > 1 we must not free this + unformatted node since it is in use. */ + reiserfs_prepare_for_journal(p_s_sb, p_s_bh, 1); + // note: path could be changed, first line in for loop takes care + // of it + + for ( n_retry = 0, n_counter = *p_n_removed; + n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + + if (item_moved (&s_ih, p_s_path)) { + need_research = 1 ; + break; + } +#ifdef CONFIG_REISERFS_CHECK + if (p_n_unfm_pointer < (__u32 *)B_I_PITEM(p_s_bh, &s_ih) || + p_n_unfm_pointer > (__u32 *)B_I_PITEM(p_s_bh, &s_ih) + I_UNFM_NUM(&s_ih) - 1) + reiserfs_panic (p_s_sb, "vs-5265: prepare_for_delete_or_cut: pointer out of range"); +#endif + + if ( ! *p_n_unfm_pointer ) { /* Hole, nothing to remove. */ + if ( ! n_retry ) + (*p_n_removed)++; + continue; + } + /* Search for the buffer in cache. 
*/ + p_s_un_bh = get_hash_table(p_s_sb->s_dev, *p_n_unfm_pointer, n_blk_size); + + if (p_s_un_bh && buffer_locked(p_s_un_bh)) { + __wait_on_buffer(p_s_un_bh) ; + if ( item_moved (&s_ih, p_s_path) ) { + need_research = 1; + brelse(p_s_un_bh) ; + break ; + } + } + if ( p_s_un_bh && block_in_use (p_s_un_bh)) { + /* Block is locked or held more than by one holder and by + journal. */ + +#ifndef __KERNEL__ + reiserfs_panic(p_s_sb, "PAP-5270: prepare_for_delete_or_cut: b_count != 1"); +#endif + +#ifdef CONFIG_REISERFS_CHECK + if (n_repeat_counter && (n_repeat_counter % 100000) == 0) { + printk("prepare_for_delete, waiting on buffer %lu, b_count %d, %s%cJDIRTY %cJDIRTY_WAIT\n", + p_s_un_bh->b_blocknr, atomic_read (&p_s_un_bh->b_count), + buffer_locked (p_s_un_bh) ? "locked, " : "", + buffer_journaled(p_s_un_bh) ? ' ' : '!', + buffer_journal_dirty(p_s_un_bh) ? ' ' : '!') ; + + } +#endif + n_retry = 1; + brelse (p_s_un_bh); + continue; + } + + if ( ! n_retry ) + (*p_n_removed)++; + +#ifdef CONFIG_REISERFS_CHECK + if ( p_s_un_bh && (*p_n_unfm_pointer != p_s_un_bh->b_blocknr )) + // note: minix_truncate allows that. As truncate is + // protected by down (inode->i_sem), two truncates can not + // co-exist + reiserfs_panic(p_s_sb, "PAP-5280: prepare_for_delete_or_cut: blocks numbers are different"); +#endif + + tmp = *p_n_unfm_pointer; + *p_n_unfm_pointer = 0; + journal_mark_dirty (th, p_s_sb, p_s_bh); + bforget (p_s_un_bh); + inode->i_blocks -= p_s_sb->s_blocksize / 512; + reiserfs_free_block(th, tmp); + if ( item_moved (&s_ih, p_s_path) ) { + need_research = 1; + break ; +#if 0 + reiserfs_prepare_for_journal(p_s_sb, + PATH_PLAST_BUFFER(p_s_path), + 1) ; + if ( comp_items(&s_ih, p_s_path) ) { + reiserfs_restore_prepared_buffer(p_s_sb, + PATH_PLAST_BUFFER(p_s_path)) ; + brelse(p_s_un_bh); + break; + } + *p_n_unfm_pointer = 0; + journal_mark_dirty (th,p_s_sb,PATH_PLAST_BUFFER(p_s_path)); + + reiserfs_free_block(th, p_s_sb, block_addr); + if (p_s_un_bh) { + mark_buffer_clean (p_s_un_bh); + brelse (p_s_un_bh); + } + if ( comp_items(&s_ih, p_s_path) ) { + break ; + } +#endif + } + + } + + /* a trick. If the buffer has been logged, this + ** will do nothing. If we've broken the loop without + ** logging it, it will restore the buffer + ** + */ + reiserfs_restore_prepared_buffer(p_s_sb, p_s_bh); + + if ( n_retry ) { + /* There is block in use. Wait, they should release it soon */ + +#ifdef CONFIG_REISERFS_CHECK + if ( *p_n_removed >= n_unfm_number ) + reiserfs_panic(p_s_sb, "PAP-5290: prepare_for_delete_or_cut: illegal case"); + if ( !(++n_repeat_counter % 500000) ) { + reiserfs_warning("PAP-5300: prepare_for_delete_or_cut: (pid %u): " + "could not delete item %k in (%d) iterations. New file length %Lu. (inode %Ld), Still trying\n", + current->pid, p_s_item_key, n_repeat_counter, n_new_file_length, inode->i_size); + if (n_repeat_counter == 5000000) { + print_block (PATH_PLAST_BUFFER(p_s_path), 3, + PATH_LAST_POSITION (p_s_path) - 2, PATH_LAST_POSITION (p_s_path) + 2); + reiserfs_panic(p_s_sb, "PAP-5305: prepare_for_delete_or_cut: key %k, new_file_length %Ld", + p_s_item_key, n_new_file_length); + } + } +#endif + +#ifdef __KERNEL__ + run_task_queue(&tq_disk); + current->policy |= SCHED_YIELD; + schedule(); +#endif + } + /* This loop can be optimized. 
*/ + } while ( (*p_n_removed < n_unfm_number || need_research) && + search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_FOUND ); + +#ifdef CONFIG_REISERFS_CHECK + if ( *p_n_removed < n_unfm_number ) + reiserfs_panic(p_s_sb, "PAP-5310: prepare_for_delete_or_cut: indirect item is not found"); + + if (item_moved (&s_ih, p_s_path) ) { + printk("prepare_for_delete_or_cut: after while, comp failed, retry\n") ; + BUG (); + } +#endif + + if (c_mode == M_CUT) + pos_in_item (p_s_path) *= UNFM_P_SIZE; + return c_mode; + } +} + + +/* Calculate bytes number which will be deleted or cutted in the balance. */ +int calc_deleted_bytes_number( + struct tree_balance * p_s_tb, + char c_mode + ) { + int n_del_size; + struct item_head * p_le_ih = PATH_PITEM_HEAD(p_s_tb->tb_path); + + if ( is_statdata_le_ih (p_le_ih) ) + return 0; + + if ( is_direntry_le_ih (p_le_ih) ) { + // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ + // we can't use EMPTY_DIR_SIZE, as old format dirs have a different + // empty size. ick. FIXME, is this right? + // + return le16_to_cpu(p_le_ih->ih_item_len) ; + } + n_del_size = ( c_mode == M_DELETE ) ? le16_to_cpu (p_le_ih->ih_item_len) : -p_s_tb->insert_size[0]; + + if ( is_indirect_le_ih (p_le_ih) ) + n_del_size = (n_del_size/UNFM_P_SIZE)* + (PATH_PLAST_BUFFER(p_s_tb->tb_path)->b_size);// - get_ih_free_space (p_le_ih); + return n_del_size; +} + +static void init_tb_struct( + struct reiserfs_transaction_handle *th, + struct tree_balance * p_s_tb, + struct super_block * p_s_sb, + struct path * p_s_path, + int n_size + ) { + memset (p_s_tb,'\0',sizeof(struct tree_balance)); + p_s_tb->transaction_handle = th ; + p_s_tb->tb_sb = p_s_sb; + p_s_tb->tb_path = p_s_path; + PATH_OFFSET_PBUFFER(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = NULL; + PATH_OFFSET_POSITION(p_s_path, ILLEGAL_PATH_ELEMENT_OFFSET) = 0; + p_s_tb->insert_size[0] = n_size; +} + + + +void padd_item (char * item, int total_length, int length) +{ + int i; + + for (i = total_length; i > length; ) + item [--i] = 0; +} + + +/* Delete object item. */ +int reiserfs_delete_item (struct reiserfs_transaction_handle *th, + struct path * p_s_path, /* Path to the deleted item. */ + struct cpu_key * p_s_item_key, /* Key to search for the deleted item. */ + struct inode * p_s_inode,/* inode is here just to update i_blocks */ + struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. 
*/ +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + struct tree_balance s_del_balance; + struct item_head s_ih; + int n_ret_value, + n_del_size, + n_removed; + +#ifdef CONFIG_REISERFS_CHECK + char c_mode; + int n_iter = 0; +#endif + + init_tb_struct(th, &s_del_balance, p_s_sb, p_s_path, 0/*size is unknown*/); + + while ( 1 ) { + n_removed = 0; + +#ifdef CONFIG_REISERFS_CHECK + n_iter++; + c_mode = +#endif + prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, &n_del_size, max_reiserfs_offset (p_s_inode)); + +#ifdef CONFIG_REISERFS_CHECK + if ( c_mode != M_DELETE ) + reiserfs_panic(p_s_sb, "PAP-5320: reiserfs_delete_item: mode must be M_DELETE"); +#endif + + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); + s_del_balance.insert_size[0] = n_del_size; + + n_ret_value = fix_nodes(M_DELETE, &s_del_balance, NULL, 0); + if ( n_ret_value != REPEAT_SEARCH ) + break; + + // file system changed, repeat search + n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == IO_ERROR) + break; + if (n_ret_value == FILE_NOT_FOUND) { + reiserfs_warning ("vs-5340: reiserfs_delete_item: " + "no items of the file %K found\n", p_s_item_key); + break; + } + } /* while (1) */ + + if ( n_ret_value != CARRY_ON ) { + unfix_nodes(&s_del_balance); + return 0; + } + + // reiserfs_delete_item returns item length when success + n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + + if ( p_s_un_bh ) { + int off; + int block_off ; + char *data ; + + /* We are in direct2indirect conversion, so move tail contents + to the unformatted node */ + /* note, we do the copy before preparing the buffer because we + ** don't care about the contents of the unformatted node yet. + ** the only thing we really care about is the direct item's data + ** is in the unformatted node. + ** + ** Otherwise, we would have to call reiserfs_prepare_for_journal on + ** the unformatted node, which might schedule, meaning we'd have to + ** loop all the way back up to the start of the while loop. + ** + ** The unformatted node is prepared and logged after the do_balance. + ** + ** p_s_un_bh is from the page cache (all unformatted nodes are + ** from the page cache) and might be a highmem page. So, we + ** can't use p_s_un_bh->b_data. But, the page has already been + ** kmapped, so we can use page_address() + ** -clm + */ + + data = page_address(p_s_un_bh->b_page) ; + off = ((le_ih_k_offset (&s_ih) - 1) & (PAGE_CACHE_SIZE - 1)); + block_off = off & (p_s_un_bh->b_size - 1) ; + memcpy(data + off, + B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value); + + /* clear out the rest of the block past the end of the file. */ + if (block_off + n_ret_value < p_s_un_bh->b_size) { + memset(data + off + n_ret_value, 0, + p_s_un_bh->b_size - block_off - n_ret_value) ; + } + } + + /* Perform balancing after all resources have been collected at once. */ + do_balance(&s_del_balance, NULL, NULL, M_DELETE); + + /* see comment above for why this is after the do_balance */ + if (p_s_un_bh) { + mark_buffer_dirty(p_s_un_bh) ; + } + + /* Return deleted body length */ + return n_ret_value; +} + + +/* Summary Of Mechanisms For Handling Collisions Between Processes: + + deletion of the body of the object is performed by iput(), with the + result that if multiple processes are operating on a file, the + deletion of the body of the file is deferred until the last process + that has an open inode performs its iput(). + + writes and truncates are protected from collisions by use of + semaphores. 
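+
+   within a single tree operation the path is revalidated after any
+   schedule: search_by_key() restarts from the root when fs_changed()
+   and key_in_buffer() show the tree moved under it, and the delete/cut
+   loops repeat the search whenever fix_nodes() returns REPEAT_SEARCH.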
+ + creates, linking, and mknod are protected from collisions with other + processes by making the reiserfs_add_entry() the last step in the + creation, and then rolling back all changes if there was a collision. + - Hans +*/ + + +/* this deletes item which never gets split */ +static void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th, + struct key * key) +{ + struct tree_balance tb; + INITIALIZE_PATH (path); + int item_len; + int tb_init = 0 ; + struct cpu_key cpu_key; + int retval; + + le_key2cpu_key (&cpu_key, key); + + while (1) { + retval = search_item (th->t_super, &cpu_key, &path); + if (retval == IO_ERROR) { + reiserfs_warning ("vs-: reiserfs_delete_solid_item: " + "i/o failure occured trying to delete %K\n", &cpu_key); + break; + } + if (retval != ITEM_FOUND) { + pathrelse (&path); + reiserfs_warning ("vs-: reiserfs_delete_solid_item: %k not found", + key); + break; + } + if (!tb_init) { + tb_init = 1 ; + item_len = le16_to_cpu (PATH_PITEM_HEAD (&path)->ih_item_len); + init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len)); + } + + retval = fix_nodes (M_DELETE, &tb, NULL, 0); + if (retval == REPEAT_SEARCH) + continue; + + if (retval == CARRY_ON) { + do_balance (&tb, 0, 0, M_DELETE); + break; + } + + // IO_ERROR, NO_DISK_SPACE, etc + reiserfs_warning ("vs-: reiserfs_delete_solid_item: " + "could not delete %K due to fix_nodes failure\n", &cpu_key); + unfix_nodes (&tb); + break; + } + + reiserfs_check_path(&path) ; +} + + +void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * inode) +{ + inode->i_size = 0; + + /* for directory this deletes item containing "." and ".." */ + reiserfs_do_truncate (th, inode, NULL, 0/*no timestamp updates*/); + + /* delete stat data */ + /* this debug code needs to go away. Trying to find a truncate race + ** -- clm -- 4/1/2000 + */ +#if 0 + if (inode->i_nlink != 0) { + reiserfs_warning("clm-4001: deleting inode with link count==%d\n", inode->i_nlink) ; + } +#endif + reiserfs_delete_solid_item (th, INODE_PKEY (inode)); +} + + +static int maybe_indirect_to_direct (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, + struct page *page, + struct path * p_s_path, + struct cpu_key * p_s_item_key, + loff_t n_new_file_size, + char * p_c_mode + ) { + struct super_block * p_s_sb = p_s_inode->i_sb; + int n_block_size = p_s_sb->s_blocksize; + int cut_bytes; + + if (n_new_file_size != p_s_inode->i_size) + BUG (); + + /* the page being sent in could be NULL if there was an i/o error + ** reading in the last block. The user will hit problems trying to + ** read the file, but for now we just skip the indirect2direct + */ + if (atomic_read(&p_s_inode->i_count) > 1 || + !tail_has_to_be_packed (p_s_inode) || + !page || p_s_inode->u.reiserfs_i.nopack) { + // leave tail in an unformatted node + *p_c_mode = M_SKIP_BALANCING; + cut_bytes = n_block_size - (n_new_file_size & (n_block_size - 1)); + pathrelse(p_s_path); + return cut_bytes; + } + /* Permorm the conversion to a direct_item. */ + /*return indirect_to_direct (p_s_inode, p_s_path, p_s_item_key, n_new_file_size, p_c_mode);*/ + return indirect2direct (th, p_s_inode, page, p_s_path, p_s_item_key, n_new_file_size, p_c_mode); +} + + +/* we did indirect_to_direct conversion. And we have inserted direct + item successesfully, but there were no disk space to cut unfm + pointer being converted. 
Therefore we have to delete inserted + direct item(s) */ +static void indirect_to_direct_roll_back (struct reiserfs_transaction_handle *th, struct inode * inode, struct path * path) +{ + struct cpu_key tail_key; + int tail_len; + int removed; + + make_cpu_key (&tail_key, inode, inode->i_size + 1, TYPE_DIRECT, 4);// !!!! + tail_key.key_length = 4; + + tail_len = (cpu_key_k_offset (&tail_key) & (inode->i_sb->s_blocksize - 1)) - 1; + while (tail_len) { + /* look for the last byte of the tail */ + if (search_for_position_by_key (inode->i_sb, &tail_key, path) == POSITION_NOT_FOUND) + reiserfs_panic (inode->i_sb, "vs-5615: indirect_to_direct_roll_back: found invalid item"); +#ifdef CONFIG_REISERFS_CHECK + if (path->pos_in_item != PATH_PITEM_HEAD (path)->ih_item_len - 1) + reiserfs_panic (inode->i_sb, "vs-5616: indirect_to_direct_roll_back: appended bytes found"); +#endif + PATH_LAST_POSITION (path) --; + + removed = reiserfs_delete_item (th, path, &tail_key, inode, 0/*unbh not needed*/); +#ifdef CONFIG_REISERFS_CHECK + if (removed <= 0 || removed > tail_len) + reiserfs_panic (inode->i_sb, "vs-5617: indirect_to_direct_roll_back: " + "there was tail %d bytes, removed item length %d bytes", + tail_len, removed); +#endif + tail_len -= removed; + set_cpu_key_k_offset (&tail_key, cpu_key_k_offset (&tail_key) - removed); + } + printk ("indirect_to_direct_roll_back: indirect_to_direct conversion has been rolled back due to lack of disk space\n"); + //mark_file_without_tail (inode); + mark_inode_dirty (inode); +} + + +/* (Truncate or cut entry) or delete object item. Returns < 0 on failure */ +int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th, + struct path * p_s_path, + struct cpu_key * p_s_item_key, + struct inode * p_s_inode, + struct page *page, + loff_t n_new_file_size) +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + /* Every function which is going to call do_balance must first + create a tree_balance structure. Then it must fill up this + structure by using the init_tb_struct and fix_nodes functions. + After that we can make tree balancing. */ + struct tree_balance s_cut_balance; + int n_cut_size = 0, /* Amount to be cut. */ + n_ret_value = CARRY_ON, + n_removed = 0, /* Number of the removed unformatted nodes. */ + n_is_inode_locked = 0; + char c_mode; /* Mode of the balance. */ + int retval2 = -1; + + + init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size); + + + /* Repeat this loop until we either cut the item without needing + to balance, or we fix_nodes without schedule occuring */ + while ( 1 ) { + /* Determine the balance mode, position of the first byte to + be cut, and size to be cut. In case of the indirect item + free unformatted nodes which are pointed to by the cut + pointers. */ + + c_mode = prepare_for_delete_or_cut(th, p_s_inode, p_s_path, p_s_item_key, &n_removed, + &n_cut_size, n_new_file_size); + if ( c_mode == M_CONVERT ) { + /* convert last unformatted node to direct item or leave + tail in the unformatted node */ +#ifdef CONFIG_REISERFS_CHECK + if ( n_ret_value != CARRY_ON ) + reiserfs_panic (p_s_sb, "PAP-5570: reiserfs_cut_from_item: can not convert twice"); +#endif + + n_ret_value = maybe_indirect_to_direct (th, p_s_inode, page, p_s_path, p_s_item_key, + n_new_file_size, &c_mode); + if ( c_mode == M_SKIP_BALANCING ) + /* tail has been left in the unformatted node */ + return n_ret_value; + + n_is_inode_locked = 1; + + /* removing of last unformatted node will change value we + have to return to truncate. 
Save it */ + retval2 = n_ret_value; + /*retval2 = p_s_sb->s_blocksize - (n_new_file_size & (p_s_sb->s_blocksize - 1));*/ + + /* So, we have performed the first part of the conversion: + inserting the new direct item. Now we are removing the + last unformatted node pointer. Set key to search for + it. */ + set_cpu_key_k_type (p_s_item_key, TYPE_INDIRECT); + p_s_item_key->key_length = 4; + n_new_file_size -= (n_new_file_size & (p_s_sb->s_blocksize - 1)); + set_cpu_key_k_offset (p_s_item_key, n_new_file_size + 1); + if ( search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ){ + print_block (PATH_PLAST_BUFFER (p_s_path), 3, PATH_LAST_POSITION (p_s_path) - 1, PATH_LAST_POSITION (p_s_path) + 1); + reiserfs_panic(p_s_sb, "PAP-5580: reiserfs_cut_from_item: item to convert does not exist (%k)", p_s_item_key); + } + continue; + } + if (n_cut_size == 0) { + pathrelse (p_s_path); + return 0; + } + + s_cut_balance.insert_size[0] = n_cut_size; + + n_ret_value = fix_nodes(c_mode, &s_cut_balance, NULL, 0); + if ( n_ret_value != REPEAT_SEARCH ) + break; + + n_ret_value = search_for_position_by_key(p_s_sb, p_s_item_key, p_s_path); + if (n_ret_value == POSITION_FOUND) + continue; + + reiserfs_warning ("PAP-5610: reiserfs_cut_from_item: item %K not found\n", p_s_item_key); + pathrelse (p_s_path); + return (n_ret_value == IO_ERROR) ? -EIO : -ENOENT; + } /* while */ + + // check fix_nodes results (IO_ERROR or NO_DISK_SPACE) + if ( n_ret_value != CARRY_ON ) { + if ( n_is_inode_locked ) { + // FIXME: this seems to be not needed: we are always able + // to cut item + indirect_to_direct_roll_back (th, p_s_inode, p_s_path); + } + if (n_ret_value == NO_DISK_SPACE) + reiserfs_warning (""); + unfix_nodes (&s_cut_balance); + return -EIO; + } + + /* go ahead and perform balancing */ + +#ifdef CONFIG_REISERFS_CHECK + if ( c_mode == M_PASTE || c_mode == M_INSERT ) + reiserfs_panic (p_s_sb, "PAP-5640: reiserfs_cut_from_item: illegal mode"); +#endif + + /* Calculate number of bytes that need to be cut from the item. */ + if (retval2 == -1) + n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); + else + n_ret_value = retval2; + + if ( c_mode == M_DELETE ) { + struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + + if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { + /* we delete first part of tail which was stored in direct + item(s) */ + // FIXME: this is to keep 3.5 happy + p_s_inode->u.reiserfs_i.i_first_direct_byte = U32_MAX; + p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512; + } + } + +#ifdef CONFIG_REISERFS_CHECK + if (n_is_inode_locked) { + struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + /* we are going to complete indirect2direct conversion. 
Make + sure, that we exactly remove last unformatted node pointer + of the item */ + if (!is_indirect_le_ih (le_ih)) + reiserfs_panic (p_s_sb, "vs-5652: reiserfs_cut_from_item: " + "item must be indirect %h", le_ih); + + if (c_mode == M_DELETE && le16_to_cpu (le_ih->ih_item_len) != UNFM_P_SIZE) + reiserfs_panic (p_s_sb, "vs-5653: reiserfs_cut_from_item: " + "completing indirect2direct conversion indirect item %h" + "being deleted must be of 4 byte long", le_ih); + + if (c_mode == M_CUT && s_cut_balance.insert_size[0] != -UNFM_P_SIZE) { + reiserfs_panic (p_s_sb, "vs-5654: reiserfs_cut_from_item: " + "can not complete indirect2direct conversion of %h (CUT, insert_size==%d)", + le_ih, s_cut_balance.insert_size[0]); + } + /* it would be useful to make sure, that right neighboring + item is direct item of this file */ + } +#endif + + do_balance(&s_cut_balance, NULL, NULL, c_mode); + if ( n_is_inode_locked ) { + /* we've converted from indirect to direct, we must remove + ** ourselves from the list of pages that need flushing before + ** this transaction can commit + */ + reiserfs_remove_page_from_flush_list(th, p_s_inode) ; + p_s_inode->u.reiserfs_i.i_pack_on_close = 0 ; + } + return n_ret_value; +} + + +static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) +{ + if (inode->i_nlink) + reiserfs_warning ("vs-5655: truncate_directory: link count != 0"); + + set_le_key_k_offset (ITEM_VERSION_1, INODE_PKEY (inode), DOT_OFFSET); + set_le_key_k_type (ITEM_VERSION_1, INODE_PKEY (inode), TYPE_DIRENTRY); + reiserfs_delete_solid_item (th, INODE_PKEY (inode)); + + set_le_key_k_offset (ITEM_VERSION_1, INODE_PKEY (inode), SD_OFFSET); + set_le_key_k_type (ITEM_VERSION_1, INODE_PKEY (inode), TYPE_STAT_DATA); +} + + + + +/* Truncate file to the new size. Note, this must be called with a transaction + already started */ +void reiserfs_do_truncate (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, /* ->i_size contains new + size */ + struct page *page, /* up to date for last block */ + int update_timestamps /* when it is called by + file_release to convert + the tail - no timestamps + should be updated */ + ) { + INITIALIZE_PATH (s_search_path); /* Path to the current object item. */ + struct item_head * p_le_ih; /* Pointer to an item header. */ + struct cpu_key s_item_key; /* Key to search for a previous file item. */ + loff_t n_file_size, /* Old file size. */ + n_new_file_size;/* New file size. */ + int n_deleted; /* Number of deleted or truncated bytes. */ + int retval; + + if ( ! (S_ISREG(p_s_inode->i_mode) || S_ISDIR(p_s_inode->i_mode) || S_ISLNK(p_s_inode->i_mode)) ) + return; + + if (S_ISDIR(p_s_inode->i_mode)) { + // deletion of directory - no need to update timestamps + truncate_directory (th, p_s_inode); + return; + } + + /* Get new file size. 
*/ + n_new_file_size = p_s_inode->i_size; + + // FIXME: note, that key type is unimportant here + make_cpu_key (&s_item_key, p_s_inode, max_reiserfs_offset (p_s_inode), TYPE_DIRECT, 3); + + retval = search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path); + if (retval == IO_ERROR) { + reiserfs_warning ("vs-5657: reiserfs_do_truncate: " + "i/o failure occured trying to truncate %K\n", &s_item_key); + return; + } + if (retval == POSITION_FOUND || retval == FILE_NOT_FOUND) { + reiserfs_warning ("PAP-5660: reiserfs_do_truncate: " + "wrong result %d of search for %K\n", retval, &s_item_key); + return; + } + + s_search_path.pos_in_item --; + + /* Get real file size (total length of all file items) */ + p_le_ih = PATH_PITEM_HEAD(&s_search_path); + if ( is_statdata_le_ih (p_le_ih) ) + n_file_size = 0; + else { + loff_t offset = le_ih_k_offset (p_le_ih); + int bytes = op_bytes_number (p_le_ih,p_s_inode->i_sb->s_blocksize); + + /* this may mismatch with real file size: if last direct item + had no padding zeros and last unformatted node had no free + space, this file would have this file size */ + n_file_size = offset + bytes - 1; + } + + if ( n_file_size == 0 || n_file_size < n_new_file_size ) { + pathrelse(&s_search_path); + return; + } + /* Update key to search for the last file item. */ + set_cpu_key_k_offset (&s_item_key, n_file_size); + + do { + /* Cut or delete file item. */ + n_deleted = reiserfs_cut_from_item(th, &s_search_path, &s_item_key, p_s_inode, page, n_new_file_size); + if (n_deleted < 0) { + reiserfs_warning ("vs-5665: reiserfs_truncate_file: cut_from_item failed"); + reiserfs_check_path(&s_search_path) ; + return; + } + +#ifdef CONFIG_REISERFS_CHECK + if ( n_deleted > n_file_size ){ + reiserfs_panic (p_s_inode->i_sb, "PAP-5670: reiserfs_truncate_file: " + "reiserfs_truncate_file returns too big number: deleted %d, file_size %lu, item_key %k", + n_deleted, n_file_size, &s_item_key); + } +#endif + + /* Change key to search the last file item. */ + n_file_size -= n_deleted; + + set_cpu_key_k_offset (&s_item_key, n_file_size); + + /* While there are bytes to truncate and previous file item is presented in the tree. */ + + /* + ** This loop could take a really long time, and could log + ** many more blocks than a transaction can hold. 
So, we do a polite + ** journal end here, and if the transaction needs ending, we make + ** sure the file is consistent before ending the current trans + ** and starting a new one + */ + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + int orig_len_alloc = th->t_blocks_allocated ; + decrement_counters_in_path(&s_search_path) ; + + if (update_timestamps) { + p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME; + // FIXME: sd gets wrong size here + } + reiserfs_update_sd(th, p_s_inode) ; + + journal_end(th, p_s_inode->i_sb, orig_len_alloc) ; + journal_begin(th, p_s_inode->i_sb, orig_len_alloc) ; + } + } while ( n_file_size > ROUND_UP (n_new_file_size) && + search_for_position_by_key(p_s_inode->i_sb, &s_item_key, &s_search_path) == POSITION_FOUND ) ; + +#ifdef CONFIG_REISERFS_CHECK + if ( n_file_size > ROUND_UP (n_new_file_size) ) + reiserfs_panic (p_s_inode->i_sb, "PAP-5680: reiserfs_truncate_file: " + "truncate did not finish: new_file_size %Ld, current %Ld, oid %d\n", + n_new_file_size, n_file_size, s_item_key.on_disk_key.k_objectid); +#endif + + if (update_timestamps) { + // this is truncate, not file closing + p_s_inode->i_mtime = p_s_inode->i_ctime = CURRENT_TIME; + } + reiserfs_update_sd (th, p_s_inode); + + pathrelse(&s_search_path) ; +} + + +#ifdef CONFIG_REISERFS_CHECK +// this makes sure, that we __append__, not overwrite or add holes +static void check_research_for_paste (struct path * path, struct cpu_key * p_s_key) +{ + struct item_head * found_ih = get_ih (path); + + if (is_direct_le_ih (found_ih)) { + if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_bh (path)->b_size) != + cpu_key_k_offset (p_s_key) || + op_bytes_number (found_ih, get_bh (path)->b_size) != pos_in_item (path)) + reiserfs_panic (0, "PAP-5720: check_research_for_paste: " + "found direct item %h or position (%d) does not match to key %K", + found_ih, pos_in_item (path), p_s_key); + } + if (is_indirect_le_ih (found_ih)) { + if (le_ih_k_offset (found_ih) + op_bytes_number (found_ih, get_bh (path)->b_size) != cpu_key_k_offset (p_s_key) || + I_UNFM_NUM (found_ih) != pos_in_item (path) || + get_ih_free_space (found_ih) != 0) + reiserfs_panic (0, "PAP-5730: check_research_for_paste: " + "found indirect item (%h) or position (%d) does not match to key (%K)", + found_ih, pos_in_item (path), p_s_key); + } +} +#endif /* config reiserfs check */ + + +/* Paste bytes to the existing item. Returns bytes number pasted into the item. */ +int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, + struct path * p_s_search_path, /* Path to the pasted item. */ + struct cpu_key * p_s_key, /* Key to search for the needed item.*/ + const char * p_c_body, /* Pointer to the bytes to paste. */ + int n_pasted_size) /* Size of pasted bytes. 
*/ +{ + struct tree_balance s_paste_balance; + int retval; + + init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); + + while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) { + /* file system changed while we were in the fix_nodes */ + retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); + if (retval == IO_ERROR) + return -EIO; + if (retval == POSITION_FOUND) { + reiserfs_warning ("PAP-5710: reiserfs_paste_into_item: entry or pasted byte (%K) exists", p_s_key); + pathrelse (p_s_search_path); + return -EEXIST; + } + +#ifdef CONFIG_REISERFS_CHECK + check_research_for_paste (p_s_search_path, p_s_key); +#endif + } + + /* Perform balancing after all resources are collected by fix_nodes, and + accessing them will not risk triggering schedule. */ + if ( retval == CARRY_ON ) { + do_balance(&s_paste_balance, NULL/*ih*/, p_c_body, M_PASTE); + return 0; + } + + unfix_nodes(&s_paste_balance); + return (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; +} + + +/* Insert new item into the buffer at the path. */ +int reiserfs_insert_item(struct reiserfs_transaction_handle *th, + struct path * p_s_path, /* Path to the inserteded item. */ + struct cpu_key * key, + struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ + const char * p_c_body) /* Pointer to the bytes to insert. */ +{ + struct tree_balance s_ins_balance; + int retval; + + init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + p_s_ih->ih_item_len); + + /* + if (p_c_body == 0) + n_zeros_num = p_s_ih->ih_item_len; + */ + // le_key2cpu_key (&key, &(p_s_ih->ih_key)); + + while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { + /* file system changed while we were in the fix_nodes */ + retval = search_item (th->t_super, key, p_s_path); + if (retval == IO_ERROR) + return -EIO; + + if (retval == ITEM_FOUND) { + reiserfs_warning ("PAP-5760: reiserfs_insert_item: " + "key %K already exists in the tree\n", key); + pathrelse (p_s_path); + return -EEXIST; + } + } + + /* make balancing after all resources will be collected at a time */ + if ( retval == CARRY_ON ) { + do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); + return 0; + } + + unfix_nodes(&s_ins_balance); + return (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO; +} + + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/super.c linux/fs/reiserfs/super.c --- v2.4.0/linux/fs/reiserfs/super.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/super.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,879 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" +#include // for simple_strtoul + +#endif + +#define SUPPORT_OLD_FORMAT + +#define REISERFS_OLD_BLOCKSIZE 4096 +#define REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ 20 + + +#if 0 +// this one is not used currently +inline void reiserfs_mark_buffer_dirty (struct buffer_head * bh, int flag) +{ + mark_buffer_dirty (bh, flag); +} +#endif + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
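The tree-modifying entry points shown above (reiserfs_insert_item, reiserfs_paste_into_item, reiserfs_delete_item and reiserfs_cut_from_item) all follow the same discipline around fix_nodes() and do_balance(): collect every buffer the balance may touch, retry the search if the tree changed while collecting, and only balance on CARRY_ON. A minimal sketch of that pattern, illustrative rather than part of the patch, using only names that appear in the code above:

/* Sketch only: condenses the retry loop common to the stree.c entry points. */
static int example_tree_op(struct reiserfs_transaction_handle *th,
			   struct super_block *sb,
			   struct cpu_key *key,
			   struct path *path,
			   struct item_head *ih,	/* only M_INSERT passes an item head */
			   const char *body,
			   int mode, int size)
{
	struct tree_balance tb;
	int retval;

	init_tb_struct(th, &tb, sb, path, size);

	/* fix_nodes() gathers and locks every node the balance may need;
	** it can schedule, so the tree may change underneath it */
	while ((retval = fix_nodes(mode, &tb, ih, body)) == REPEAT_SEARCH) {
		retval = search_for_position_by_key(sb, key, path);
		if (retval == IO_ERROR)
			return -EIO;
		/* the real callers also handle ITEM_FOUND / POSITION_FOUND here */
	}

	if (retval == CARRY_ON) {
		/* every resource is pinned, so balancing cannot schedule */
		do_balance(&tb, ih, body, mode);
		return 0;
	}

	/* IO_ERROR, NO_DISK_SPACE, ...: release what fix_nodes() collected */
	unfix_nodes(&tb);
	return (retval == NO_DISK_SPACE) ? -ENOSPC : -EIO;
}

reiserfs_cut_from_item() adds one twist to this shape: prepare_for_delete_or_cut() is re-run on every pass, because the mode itself can change between M_CUT, M_DELETE and M_CONVERT as the tail conversion proceeds.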
+// +void reiserfs_write_super (struct super_block * s) +{ + + int dirty = 0 ; + lock_kernel() ; + if (!(s->s_flags & MS_RDONLY)) { + dirty = flush_old_commits(s, 1) ; + } + s->s_dirt = dirty; + unlock_kernel() ; +} + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +void reiserfs_write_super_lockfs (struct super_block * s) +{ + + int dirty = 0 ; + struct reiserfs_transaction_handle th ; + lock_kernel() ; + if (!(s->s_flags & MS_RDONLY)) { + journal_begin(&th, s, 1) ; + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + reiserfs_block_writes(&th) ; + journal_end(&th, s, 1) ; + } + s->s_dirt = dirty; + unlock_kernel() ; +} + +void reiserfs_unlockfs(struct super_block *s) { + reiserfs_allow_writes(s) ; +} + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +/* there should be no suspected recipients already. True and cautious + bitmaps should not differ. We only have to free preserve list and + write both bitmaps */ +void reiserfs_put_super (struct super_block * s) +{ + int i; + struct reiserfs_transaction_handle th ; + + /* change file system state to current state if it was mounted with read-write permissions */ + if (!(s->s_flags & MS_RDONLY)) { + journal_begin(&th, s, 10) ; + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + s->u.reiserfs_sb.s_rs->s_state = le16_to_cpu (s->u.reiserfs_sb.s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + } + + /* note, journal_release checks for readonly mount, and can decide not + ** to do a journal_end + */ + journal_release(&th, s) ; + + for (i = 0; i < SB_BMAP_NR (s); i ++) + brelse (SB_AP_BITMAP (s)[i]); + + reiserfs_kfree (SB_AP_BITMAP (s), sizeof (struct buffer_head *) * SB_BMAP_NR (s), s); + + brelse (SB_BUFFER_WITH_SB (s)); + + print_statistics (s); + + if (s->u.reiserfs_sb.s_kmallocs != 0) { + reiserfs_warning ("vs-2004: reiserfs_put_super: aloocated memory left %d\n", + s->u.reiserfs_sb.s_kmallocs); + } + + return; +} + +struct super_operations reiserfs_sops = +{ + read_inode: reiserfs_read_inode, + read_inode2: reiserfs_read_inode2, + write_inode: reiserfs_write_inode, + dirty_inode: reiserfs_dirty_inode, + delete_inode: reiserfs_delete_inode, + put_super: reiserfs_put_super, + write_super: reiserfs_write_super, + write_super_lockfs: reiserfs_write_super_lockfs, + unlockfs: reiserfs_unlockfs, + statfs: reiserfs_statfs, + remount_fs: reiserfs_remount, + +}; + +/* this was (ext2)parse_options */ +static int parse_options (char * options, unsigned long * mount_options, unsigned long * blocks) +{ + char * this_char; + char * value; + + *blocks = 0; + if (!options) + /* use default configuration: create tails, journaling on, no + conversion to newest format */ + return 1; + for (this_char = strtok (options, ","); this_char != NULL; this_char = strtok (NULL, ",")) { + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; + if (!strcmp (this_char, "notail")) { + set_bit (NOTAIL, mount_options); + } else if (!strcmp (this_char, "conv")) { + // if 
this is set, we update super block such that + // the partition will not be mounable by 3.5.x anymore + set_bit (REISERFS_CONVERT, mount_options); + } else if (!strcmp (this_char, "noborder")) { + /* this is used for benchmarking + experimental variations, it is not + intended for users to use, only for + developers who want to casually + hack in something to test */ + set_bit (REISERFS_NO_BORDER, mount_options); + } else if (!strcmp (this_char, "no_unhashed_relocation")) { + set_bit (REISERFS_NO_UNHASHED_RELOCATION, mount_options); + } else if (!strcmp (this_char, "hashed_relocation")) { + set_bit (REISERFS_HASHED_RELOCATION, mount_options); + } else if (!strcmp (this_char, "test4")) { + set_bit (REISERFS_TEST4, mount_options); + } else if (!strcmp (this_char, "nolog")) { + reiserfs_warning("reiserfs: nolog mount option not supported yet\n"); + } else if (!strcmp (this_char, "replayonly")) { + set_bit (REPLAYONLY, mount_options); + } else if (!strcmp (this_char, "resize")) { + if (value && *value){ + *blocks = simple_strtoul (value, &value, 0); + } else { + printk("reiserfs: resize option requires a value\n"); + return 0; + } + } else if (!strcmp (this_char, "hash")) { + if (value && *value) { + /* if they specify any hash option, we force detection + ** to make sure they aren't using the wrong hash + */ + if (!strcmp(value, "rupasov")) { + set_bit (FORCE_RUPASOV_HASH, mount_options); + set_bit (FORCE_HASH_DETECT, mount_options); + } else if (!strcmp(value, "tea")) { + set_bit (FORCE_TEA_HASH, mount_options); + set_bit (FORCE_HASH_DETECT, mount_options); + } else if (!strcmp(value, "r5")) { + set_bit (FORCE_R5_HASH, mount_options); + set_bit (FORCE_HASH_DETECT, mount_options); + } else if (!strcmp(value, "detect")) { + set_bit (FORCE_HASH_DETECT, mount_options); + } else { + printk("reiserfs: invalid hash function specified\n") ; + return 0 ; + } + } else { + printk("reiserfs: hash option requires a value\n"); + return 0 ; + } + } else { + printk ("reiserfs: Unrecognized mount option %s\n", this_char); + return 0; + } + } + return 1; +} + + +int reiserfs_is_super(struct super_block *s) { + return (s->s_dev != 0 && s->s_op == &reiserfs_sops) ; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +int reiserfs_remount (struct super_block * s, int * flags, char * data) +{ + struct reiserfs_super_block * rs; + struct reiserfs_transaction_handle th ; + unsigned long blocks; + unsigned long mount_options; + + rs = SB_DISK_SUPER_BLOCK (s); + + if (!parse_options(data, &mount_options, &blocks)) + return 0; + + if(blocks) { + int rc = reiserfs_resize(s, blocks); + if (rc != 0) + return rc; + } + + if ((unsigned long)(*flags & MS_RDONLY) == (s->s_flags & MS_RDONLY)) { + /* there is nothing to do to remount read-only fs as read-only fs */ + return 0; + } + + if (*flags & MS_RDONLY) { + /* try to remount file system with read-only permissions */ + if (le16_to_cpu (rs->s_state) == REISERFS_VALID_FS || s->u.reiserfs_sb.s_mount_state != REISERFS_VALID_FS) { + return 0; + } + + journal_begin(&th, s, 10) ; + /* Mounting a rw partition read-only. 
*/ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + rs->s_state = cpu_to_le16 (s->u.reiserfs_sb.s_mount_state); + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + s->s_dirt = 0; + } else { + s->u.reiserfs_sb.s_mount_state = le16_to_cpu(rs->s_state) ; + s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ + journal_begin(&th, s, 10) ; + + /* Mount a partition which is read-only, read-write */ + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + s->u.reiserfs_sb.s_mount_state = le16_to_cpu (rs->s_state); + s->s_flags &= ~MS_RDONLY; + rs->s_state = cpu_to_le16 (REISERFS_ERROR_FS); + /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + s->s_dirt = 0; + s->u.reiserfs_sb.s_mount_state = REISERFS_VALID_FS ; + } + /* this will force a full flush of all journal lists */ + SB_JOURNAL(s)->j_must_wait = 1 ; + journal_end(&th, s, 10) ; + return 0; +} + + +static int read_bitmaps (struct super_block * s) +{ + int i, bmp, dl ; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s); + + SB_AP_BITMAP (s) = reiserfs_kmalloc (sizeof (struct buffer_head *) * le16_to_cpu (rs->s_bmap_nr), GFP_BUFFER, s); + if (SB_AP_BITMAP (s) == 0) + return 1; + memset (SB_AP_BITMAP (s), 0, sizeof (struct buffer_head *) * le16_to_cpu (rs->s_bmap_nr)); + + /* reiserfs leaves the first 64k unused so that any partition + labeling scheme currently used will have enough space. Then we + need one block for the super. -Hans */ + bmp = (REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ + SB_AP_BITMAP (s)[0] = reiserfs_bread (s->s_dev, bmp, s->s_blocksize); + if(!SB_AP_BITMAP(s)[0]) + return 1; + for (i = 1, bmp = dl = rs->s_blocksize * 8; i < le16_to_cpu (rs->s_bmap_nr); i ++) { + SB_AP_BITMAP (s)[i] = reiserfs_bread (s->s_dev, bmp, s->s_blocksize); + if (!SB_AP_BITMAP (s)[i]) + return 1; + bmp += dl; + } + + return 0; +} + +static int read_old_bitmaps (struct super_block * s) +{ + int i ; + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK(s); + int bmp1 = (REISERFS_OLD_DISK_OFFSET_IN_BYTES / s->s_blocksize) + 1; /* first of bitmap blocks */ + + /* read true bitmap */ + SB_AP_BITMAP (s) = reiserfs_kmalloc (sizeof (struct buffer_head *) * le16_to_cpu (rs->s_bmap_nr), GFP_BUFFER, s); + if (SB_AP_BITMAP (s) == 0) + return 1; + + memset (SB_AP_BITMAP (s), 0, sizeof (struct buffer_head *) * le16_to_cpu (rs->s_bmap_nr)); + + for (i = 0; i < le16_to_cpu (rs->s_bmap_nr); i ++) { + SB_AP_BITMAP (s)[i] = reiserfs_bread (s->s_dev, bmp1 + i, s->s_blocksize); + if (!SB_AP_BITMAP (s)[i]) + return 1; + } + + return 0; +} + +void check_bitmap (struct super_block * s) +{ + int i = 0; + int free = 0; + char * buf; + + while (i < SB_BLOCK_COUNT (s)) { + buf = SB_AP_BITMAP (s)[i / (s->s_blocksize * 8)]->b_data; + if (!reiserfs_test_le_bit (i % (s->s_blocksize * 8), buf)) + free ++; + i ++; + } + + if (free != SB_FREE_BLOCKS (s)) + reiserfs_warning ("vs-4000: check_bitmap: %d free blocks, must be %d\n", + free, SB_FREE_BLOCKS (s)); +} + +#ifdef SUPPORT_OLD_FORMAT + +/* support old disk layout */ +static int read_old_super_block (struct super_block * s, int size) +{ + struct buffer_head * bh; + struct reiserfs_super_block * rs; + + printk("read_old_super_block: try to find super block in old location\n"); + /* there are only 4k-sized blocks in v3.5.10 */ + if (size != REISERFS_OLD_BLOCKSIZE) + set_blocksize(s->s_dev, REISERFS_OLD_BLOCKSIZE); + bh = bread (s->s_dev, + REISERFS_OLD_DISK_OFFSET_IN_BYTES / 
REISERFS_OLD_BLOCKSIZE, + REISERFS_OLD_BLOCKSIZE); + if (!bh) { + printk("read_old_super_block: unable to read superblock on dev %s\n", kdevname(s->s_dev)); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (strncmp (rs->s_magic, REISERFS_SUPER_MAGIC_STRING, strlen ( REISERFS_SUPER_MAGIC_STRING))) { + /* pre-journaling version check */ + if(!strncmp((char*)rs + REISERFS_SUPER_MAGIC_STRING_OFFSET_NJ, + REISERFS_SUPER_MAGIC_STRING, strlen(REISERFS_SUPER_MAGIC_STRING))) { + printk("read_old_super_blockr: a pre-journaling reiserfs filesystem isn't suitable there.\n"); + brelse(bh); + return 1; + } + + brelse (bh); + printk ("read_old_super_block: can't find a reiserfs filesystem on dev %s.\n", kdevname(s->s_dev)); + return 1; + } + + if(REISERFS_OLD_BLOCKSIZE != le16_to_cpu (rs->s_blocksize)) { + printk("read_old_super_block: blocksize mismatch, super block corrupted\n"); + brelse(bh); + return 1; + } + + s->s_blocksize = REISERFS_OLD_BLOCKSIZE; + s->s_blocksize_bits = 0; + while ((1 << s->s_blocksize_bits) != s->s_blocksize) + s->s_blocksize_bits ++; + + SB_BUFFER_WITH_SB (s) = bh; + SB_DISK_SUPER_BLOCK (s) = rs; + s->s_op = &reiserfs_sops; + return 0; +} +#endif + +// +// FIXME: mounting old filesystems we _must_ change magic string to +// make then unmountable by reiserfs of 3.5.x +// +static int read_super_block (struct super_block * s, int size) +{ + struct buffer_head * bh; + struct reiserfs_super_block * rs; + + bh = bread (s->s_dev, REISERFS_DISK_OFFSET_IN_BYTES / size, size); + if (!bh) { + printk("read_super_block: unable to read superblock on dev %s\n", kdevname(s->s_dev)); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_reiserfs_magic_string (rs)) { + printk ("read_super_block: can't find a reiserfs filesystem on dev %s\n", + kdevname(s->s_dev)); + brelse (bh); + return 1; + } + + // + // ok, reiserfs signature (old or new) found in 64-th 1k block of + // the device + // + +#ifndef SUPPORT_OLD_FORMAT + // with SUPPORT_OLD_FORMAT undefined - detect old format by + // checking super block version + if (le16_to_cpu (rs->s_version) != REISERFS_VERSION_2) { + brelse (bh); + printk ("read_super_block: unsupported version (%d) of reiserfs found on dev %s\n", + le16_to_cpu (rs->s_version), kdevname(s->s_dev)); + return 1; + } +#endif + + s->s_blocksize = le16_to_cpu (rs->s_blocksize); + s->s_blocksize_bits = 0; + while ((1 << s->s_blocksize_bits) != s->s_blocksize) + s->s_blocksize_bits ++; + + brelse (bh); + + if (s->s_blocksize != size) + set_blocksize (s->s_dev, s->s_blocksize); + bh = reiserfs_bread (s->s_dev, REISERFS_DISK_OFFSET_IN_BYTES / s->s_blocksize, s->s_blocksize); + if (!bh) { + printk("read_super_block: unable to read superblock on dev %s\n", kdevname(s->s_dev)); + return 1; + } + + rs = (struct reiserfs_super_block *)bh->b_data; + if (!is_reiserfs_magic_string (rs) || + le16_to_cpu (rs->s_blocksize) != s->s_blocksize) { + brelse (bh); + printk ("read_super_block: can't find a reiserfs filesystem on dev %s.\n", kdevname(s->s_dev)); + return 1; + } + /* must check to be sure we haven't pulled an old format super out + ** of the old format's log. This is a kludge of a check, but it + ** will work. If block we've just read in is inside the + ** journal for that super, it can't be valid. + */ + if (bh->b_blocknr >= le32_to_cpu(rs->s_journal_block) && + bh->b_blocknr < (le32_to_cpu(rs->s_journal_block) + JOURNAL_BLOCK_COUNT)) { + brelse(bh) ; + printk("super-459: read_super_block: super found at block %lu is within its own log. 
" + "It must not be of this format type.\n", bh->b_blocknr) ; + return 1 ; + } + SB_BUFFER_WITH_SB (s) = bh; + SB_DISK_SUPER_BLOCK (s) = rs; + s->s_op = &reiserfs_sops; + return 0; +} + +/* after journal replay, reread all bitmap and super blocks */ +static int reread_meta_blocks(struct super_block *s) { + int i ; + ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))) ; + wait_on_buffer(SB_BUFFER_WITH_SB(s)) ; + if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { + printk("reread_meta_blocks, error reading the super\n") ; + return 1 ; + } + + for (i = 0; i < SB_BMAP_NR(s) ; i++) { + ll_rw_block(READ, 1, &(SB_AP_BITMAP(s)[i])) ; + wait_on_buffer(SB_AP_BITMAP(s)[i]) ; + if (!buffer_uptodate(SB_AP_BITMAP(s)[i])) { + printk("reread_meta_blocks, error reading bitmap block number %d at %ld\n", i, SB_AP_BITMAP(s)[i]->b_blocknr) ; + return 1 ; + } + } + return 0 ; + +} + + +///////////////////////////////////////////////////// +// hash detection stuff + + +// if root directory is empty - we set default - Yura's - hash and +// warn about it +// FIXME: we look for only one name in a directory. If tea and yura +// bith have the same value - we ask user to send report to the +// mailing list +__u32 find_hash_out (struct super_block * s) +{ + int retval; + struct inode * inode; + struct cpu_key key; + INITIALIZE_PATH (path); + struct reiserfs_dir_entry de; + __u32 hash = DEFAULT_HASH; + + inode = s->s_root->d_inode; + + while (1) { + make_cpu_key (&key, inode, ~0, TYPE_DIRENTRY, 3); + retval = search_by_entry_key (s, &key, &path, &de); + if (retval == IO_ERROR) { + pathrelse (&path); + return UNSET_HASH ; + } + if (retval == NAME_NOT_FOUND) + de.de_entry_num --; + set_de_name_and_namelen (&de); + if (le32_to_cpu (de.de_deh[de.de_entry_num].deh_offset) == DOT_DOT_OFFSET) { + /* allow override in this case */ + if (reiserfs_rupasov_hash(s)) { + hash = YURA_HASH ; + } + reiserfs_warning("reiserfs: FS seems to be empty, autodetect " + "is using the default hash\n"); + break; + } + if (GET_HASH_VALUE(yura_hash (de.de_name, de.de_namelen)) == + GET_HASH_VALUE(keyed_hash (de.de_name, de.de_namelen))) { + reiserfs_warning ("reiserfs: Could not detect hash function " + "please mount with -o hash={tea,rupasov,r5}\n") ; + hash = UNSET_HASH ; + break; + } + if (GET_HASH_VALUE(le32_to_cpu(de.de_deh[de.de_entry_num].deh_offset))== + GET_HASH_VALUE (yura_hash (de.de_name, de.de_namelen))) + hash = YURA_HASH; + else + hash = TEA_HASH; + break; + } + + pathrelse (&path); + return hash; +} + +// finds out which hash names are sorted with +static int what_hash (struct super_block * s) +{ + __u32 code; + + code = le32_to_cpu (s->u.reiserfs_sb.s_rs->s_hash_function_code); + + /* reiserfs_hash_detect() == true if any of the hash mount options + ** were used. 
We must check them to make sure the user isn't + ** using a bad hash value + */ + if (code == UNSET_HASH || reiserfs_hash_detect(s)) + code = find_hash_out (s); + + if (code != UNSET_HASH && reiserfs_hash_detect(s)) { + /* detection has found the hash, and we must check against the + ** mount options + */ + if (reiserfs_rupasov_hash(s) && code != YURA_HASH) { + printk("REISERFS: Error, tea hash detected, " + "unable to force rupasov hash\n") ; + code = UNSET_HASH ; + } else if (reiserfs_tea_hash(s) && code != TEA_HASH) { + printk("REISERFS: Error, rupasov hash detected, " + "unable to force tea hash\n") ; + code = UNSET_HASH ; + } else if (reiserfs_r5_hash(s) && code != R5_HASH) { + printk("REISERFS: Error, r5 hash detected, " + "unable to force r5 hash\n") ; + code = UNSET_HASH ; + } + } else { + /* find_hash_out was not called or could not determine the hash */ + if (reiserfs_rupasov_hash(s)) { + code = YURA_HASH ; + } else if (reiserfs_tea_hash(s)) { + code = TEA_HASH ; + } else if (reiserfs_r5_hash(s)) { + code = R5_HASH ; + } + } + + /* if we are mounted RW, and we have a new valid hash code, update + ** the super + */ + if (code != UNSET_HASH && + !(s->s_flags & MS_RDONLY) && + code != le32_to_cpu (s->u.reiserfs_sb.s_rs->s_hash_function_code)) { + s->u.reiserfs_sb.s_rs->s_hash_function_code = cpu_to_le32(code) ; + } + return code; +} + +// return pointer to appropriate function +static hashf_t hash_function (struct super_block * s) +{ + switch (what_hash (s)) { + case TEA_HASH: + reiserfs_warning ("Using tea hash to sort names\n"); + return keyed_hash; + case YURA_HASH: + reiserfs_warning ("Using rupasov hash to sort names\n"); + return yura_hash; + case R5_HASH: + reiserfs_warning ("Using r5 hash to sort names\n"); + return r5_hash; + } + return NULL; +} + +// this is used to set up correct value for old partitions +int function2code (hashf_t func) +{ + if (func == keyed_hash) + return TEA_HASH; + if (func == yura_hash) + return YURA_HASH; + if (func == r5_hash) + return R5_HASH; + + BUG() ; // should never happen + + return 0; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. 
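The hash chosen by what_hash() and hash_function() above is what orders directory entries: the per-name hash value becomes part of the entry's offset key component. A minimal sketch of that use, kept to names that appear in this file; example_name_hash itself is a hypothetical helper, not a function in this patch:

/* Sketch only: how the mount-time hash selection is applied to a name.
** s_hash_function is set from hash_function() at mount time (see
** reiserfs_read_super() below); GET_HASH_VALUE() is the same macro
** that find_hash_out() uses above. */
static __u32 example_name_hash(struct super_block *s, const char *name, int len)
{
	hashf_t h = s->u.reiserfs_sb.s_hash_function;	/* keyed_hash, yura_hash or r5_hash */

	return GET_HASH_VALUE(h(name, len));
}

Two different names can produce the same hash value; such entries are told apart by a generation counter folded into the offset, and the EHASHCOLLISION errno added later in this patch reports the case where that counter is exhausted (its comment there reads "Number of hash collisions exceeds maximum generation counter value").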
+// +struct super_block * reiserfs_read_super (struct super_block * s, void * data, int silent) +{ + int size; + struct inode *root_inode; + kdev_t dev = s->s_dev; + int j; + extern int *blksize_size[]; + struct reiserfs_transaction_handle th ; + int old_format = 0; + unsigned long blocks; + int jinit_done = 0 ; + struct reiserfs_iget4_args args ; + + + memset (&s->u.reiserfs_sb, 0, sizeof (struct reiserfs_sb_info)); + + if (parse_options ((char *) data, &(s->u.reiserfs_sb.s_mount_opt), &blocks) == 0) { + return NULL; + } + + if (blocks) { + printk("reserfs: resize option for remount only\n"); + return NULL; + } + + if (blksize_size[MAJOR(dev)] && blksize_size[MAJOR(dev)][MINOR(dev)] != 0) { + /* as blocksize is set for partition we use it */ + size = blksize_size[MAJOR(dev)][MINOR(dev)]; + } else { + size = BLOCK_SIZE; + set_blocksize (s->s_dev, BLOCK_SIZE); + } + + /* read block (64-th 1k block), which can contain reiserfs super block */ + if (read_super_block (s, size)) { +#ifdef SUPPORT_OLD_FORMAT + // try old format (undistributed bitmap, super block in 8-th 1k block of a device) + if(read_old_super_block(s,size)) + goto error; + else + old_format = 1; +#endif + goto error ; + } + + s->u.reiserfs_sb.s_mount_state = le16_to_cpu (SB_DISK_SUPER_BLOCK (s)->s_state); /* journal victim */ + s->u.reiserfs_sb.s_mount_state = REISERFS_VALID_FS ; + + if (old_format ? read_old_bitmaps(s) : read_bitmaps(s)) { + printk ("reiserfs_read_super: unable to read bitmap\n"); + goto error; + } +#ifdef CONFIG_REISERFS_CHECK + printk("reiserfs:warning: CONFIG_REISERFS_CHECK is set ON\n"); + printk("reiserfs:warning: - it is slow mode for debugging.\n"); +#endif + + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s)) { + printk("reiserfs_read_super: unable to initialize journal space\n") ; + goto error ; + } else { + jinit_done = 1 ; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + if (reread_meta_blocks(s)) { + printk("reiserfs_read_super: unable to reread meta blocks after journal init\n") ; + goto error ; + } + + if (replay_only (s)) + goto error; + + if (is_read_only(s->s_dev) && !(s->s_flags & MS_RDONLY)) { + printk("clm-7000: Detected readonly device, marking FS readonly\n") ; + s->s_flags |= MS_RDONLY ; + } + args.objectid = REISERFS_ROOT_PARENT_OBJECTID ; + root_inode = iget4 (s, REISERFS_ROOT_OBJECTID, 0, (void *)(&args)); + if (!root_inode) { + printk ("reiserfs_read_super: get root inode failed\n"); + goto error; + } + + s->s_root = d_alloc_root(root_inode); + if (!s->s_root) { + iput(root_inode); + goto error; + } + + // define and initialize hash function + s->u.reiserfs_sb.s_hash_function = hash_function (s); + if (s->u.reiserfs_sb.s_hash_function == NULL) { + dput(s->s_root) ; + s->s_root = NULL ; + goto error ; + } + + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + + journal_begin(&th, s, 1) ; + reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; + + rs->s_state = cpu_to_le16 (REISERFS_ERROR_FS); + + if (strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, + strlen ( REISER2FS_SUPER_MAGIC_STRING))) { + if (le16_to_cpu(rs->s_version) != 0) + BUG (); + // filesystem created under 3.5.x found + if (!old_format_only (s)) { + reiserfs_warning("reiserfs: converting 3.5.x filesystem to the new format\n") ; + // after this 3.5.x will not be able to mount this partition + memcpy (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, + sizeof (REISER2FS_SUPER_MAGIC_STRING)); + + 
reiserfs_convert_objectid_map_v1(s) ; + } else { + reiserfs_warning("reiserfs: using 3.5.x disk format\n") ; + } + } else { + // new format found + set_bit (REISERFS_CONVERT, &(s->u.reiserfs_sb.s_mount_opt)); + } + + // mark hash in super block: it could be unset. overwrite should be ok + rs->s_hash_function_code = cpu_to_le32 (function2code (s->u.reiserfs_sb.s_hash_function)); + + journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); + journal_end(&th, s, 1) ; + s->s_dirt = 0; + } else { + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + if (strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, + strlen ( REISER2FS_SUPER_MAGIC_STRING))) { + reiserfs_warning("reiserfs: using 3.5.x disk format\n") ; + } + } + + init_waitqueue_head (&(s->u.reiserfs_sb.s_wait)); + + printk("%s\n", reiserfs_get_version_string()) ; + return s; + + error: + if (jinit_done) { /* kill the commit thread, free journal ram */ + journal_release_error(NULL, s) ; + } + if (SB_DISK_SUPER_BLOCK (s)) { + for (j = 0; j < SB_BMAP_NR (s); j ++) { + if (SB_AP_BITMAP (s)) + brelse (SB_AP_BITMAP (s)[j]); + } + if (SB_AP_BITMAP (s)) + reiserfs_kfree (SB_AP_BITMAP (s), sizeof (struct buffer_head *) * SB_BMAP_NR (s), s); + } + if (SB_BUFFER_WITH_SB (s)) + brelse(SB_BUFFER_WITH_SB (s)); + + return NULL; +} + + +// +// a portion of this function, particularly the VFS interface portion, +// was derived from minix or ext2's analog and evolved as the +// prototype did. You should be able to tell which portion by looking +// at the ext2 code and comparing. It's subfunctions contain no code +// used as a template unless they are so labeled. +// +int reiserfs_statfs (struct super_block * s, struct statfs * buf) +{ + struct reiserfs_super_block * rs = SB_DISK_SUPER_BLOCK (s); + + /* changed to accomodate gcc folks.*/ + buf->f_type = REISERFS_SUPER_MAGIC; + buf->f_bsize = le32_to_cpu (s->s_blocksize); + buf->f_blocks = le32_to_cpu (rs->s_block_count) - le16_to_cpu (rs->s_bmap_nr) - 1; + buf->f_bfree = le32_to_cpu (rs->s_free_blocks); + buf->f_bavail = buf->f_bfree; + buf->f_files = -1; + buf->f_ffree = -1; + buf->f_namelen = (REISERFS_MAX_NAME_LEN (s->s_blocksize)); + return 0; +} + +#ifdef __KERNEL__ + +static DECLARE_FSTYPE_DEV(reiserfs_fs_type,"reiserfs",reiserfs_read_super); + +// +// this is exactly what 2.3.99-pre9's init_ext2_fs is +// +static int __init init_reiserfs_fs (void) +{ + return register_filesystem(&reiserfs_fs_type); +} + +EXPORT_NO_SYMBOLS; + +// +// this is exactly what 2.3.99-pre9's init_ext2_fs is +// +static void __exit exit_reiserfs_fs(void) +{ + unregister_filesystem(&reiserfs_fs_type); +} + +module_init(init_reiserfs_fs) ; +module_exit(exit_reiserfs_fs) ; + +#endif + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/tail_conversion.c linux/fs/reiserfs/tail_conversion.c --- v2.4.0/linux/fs/reiserfs/tail_conversion.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/tail_conversion.c Mon Jan 15 15:31:19 2001 @@ -0,0 +1,297 @@ +/* + * Copyright 1999 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#else + +#include "nokernel.h" + +#endif + + +/* access to tail : when one is going to read tail it must make sure, that is not running. + direct2indirect and indirect2direct can not run concurrently */ + + +/* Converts direct items to an unformatted node. Panics if file has no + tail. 
-ENOSPC if no disk space for conversion */ +/* path points to first direct item of the file regarless of how many of + them are there */ +int direct2indirect (struct reiserfs_transaction_handle *th, struct inode * inode, + struct path * path, struct buffer_head * unbh, + loff_t tail_offset) +{ + struct super_block * sb = inode->i_sb; + struct buffer_head *up_to_date_bh ; + struct item_head * p_le_ih = PATH_PITEM_HEAD (path); + struct cpu_key end_key; /* Key to search for the last byte of the + converted item. */ + struct item_head ind_ih; /* new indirect item to be inserted or + key of unfm pointer to be pasted */ + int n_blk_size, + n_retval; /* returned value for reiserfs_insert_item and clones */ + struct unfm_nodeinfo unfm_ptr; /* Handle on an unformatted node + that will be inserted in the + tree. */ + + + sb->u.reiserfs_sb.s_direct2indirect ++; + + n_blk_size = sb->s_blocksize; + + /* and key to search for append or insert pointer to the new + unformatted node. */ + copy_item_head (&ind_ih, p_le_ih); + set_le_ih_k_offset (&ind_ih, tail_offset); + set_le_ih_k_type (&ind_ih, TYPE_INDIRECT); + + /* Set the key to search for the place for new unfm pointer */ + make_cpu_key (&end_key, inode, tail_offset, TYPE_INDIRECT, 4); + + // FIXME: we could avoid this + if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) + reiserfs_panic (sb, "PAP-14030: direct2indirect: " + "pasted or inserted byte exists in the tree"); + + p_le_ih = PATH_PITEM_HEAD (path); + + unfm_ptr.unfm_nodenum = cpu_to_le32 (unbh->b_blocknr); + unfm_ptr.unfm_freespace = 0; // ??? + + if ( is_statdata_le_ih (p_le_ih) ) { + /* Insert new indirect item. */ + set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ + ind_ih.ih_item_len = cpu_to_le16 (UNFM_P_SIZE); + PATH_LAST_POSITION (path)++; + n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, + (char *)&unfm_ptr); + } else { + /* Paste into last indirect item of an object. */ + n_retval = reiserfs_paste_into_item(th, path, &end_key, + (char *)&unfm_ptr, UNFM_P_SIZE); + } + if ( n_retval ) { + return n_retval; + } + + // note: from here there are two keys which have matching first + // three key components. They only differ by the fourth one. + + + /* Set the key to search for the direct items of the file */ + make_cpu_key (&end_key, inode, max_reiserfs_offset (inode), TYPE_DIRECT, 4); + + /* Move bytes from the direct items to the new unformatted node + and delete them. */ + while (1) { + int item_len, first_direct; + + /* end_key.k_offset is set so, that we will always have found + last item of the file */ + if ( search_for_position_by_key (sb, &end_key, path) == POSITION_FOUND ) + reiserfs_panic (sb, "PAP-14050: direct2indirect: " + "direct item (%k) not found", &end_key); + p_le_ih = PATH_PITEM_HEAD (path); +#ifdef CONFIG_REISERFS_CHECK + if (!is_direct_le_ih (p_le_ih)) + reiserfs_panic (sb, "vs-14055: direct2indirect: " + "direct item expected, found %h", p_le_ih); +#endif + if ((le_ih_k_offset (p_le_ih) & (n_blk_size - 1)) == 1) + first_direct = 1; + else + first_direct = 0; + item_len = le16_to_cpu (p_le_ih->ih_item_len); + + /* we only send the unbh pointer if the buffer is not up to date. 
+ ** this avoids overwriting good data from writepage() with old data + ** from the disk or buffer cache + */ + if (buffer_uptodate(unbh) || Page_Uptodate(unbh->b_page)) { + up_to_date_bh = NULL ; + } else { + up_to_date_bh = unbh ; + } + n_retval = reiserfs_delete_item (th, path, &end_key, inode, + up_to_date_bh) ; + + if (first_direct && item_len == n_retval) + // done: file does not have direct items anymore + break; + + } + + inode->u.reiserfs_i.i_first_direct_byte = U32_MAX; + + return 0; +} + + +/* stolen from fs/buffer.c */ +void reiserfs_unmap_buffer(struct buffer_head *bh) { + if (buffer_mapped(bh)) { + if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { + BUG() ; + } + mark_buffer_clean(bh) ; + wait_on_buffer(bh) ; + // clear_bit(BH_Uptodate, &bh->b_state) ; + clear_bit(BH_Mapped, &bh->b_state) ; + clear_bit(BH_Req, &bh->b_state) ; + clear_bit(BH_New, &bh->b_state) ; + } +} + +static void +unmap_buffers(struct page *page, loff_t pos) { + struct buffer_head *bh ; + struct buffer_head *head ; + struct buffer_head *next ; + unsigned long tail_index ; + unsigned long cur_index ; + + if (page) { + if (page->buffers) { + tail_index = pos & (PAGE_CACHE_SIZE - 1) ; + cur_index = 0 ; + head = page->buffers ; + bh = head ; + do { + next = bh->b_this_page ; + + /* we want to unmap the buffers that contain the tail, and + ** all the buffers after it (since the tail must be at the + ** end of the file). We don't want to unmap file data + ** before the tail, since it might be dirty and waiting to + ** reach disk + */ + cur_index += bh->b_size ; + if (cur_index > tail_index) { + reiserfs_unmap_buffer(bh) ; + } + bh = next ; + } while (bh != head) ; + } + } +} + +/* this first locks inode (neither reads nor sync are permitted), + reads tail through page cache, insert direct item. When direct item + inserted successfully inode is left locked. Return value is always + what we expect from it (number of cut bytes). But when tail remains + in the unformatted node, we set mode to SKIP_BALANCING and unlock + inode */ +int indirect2direct (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, + struct page *page, + struct path * p_s_path, /* path to the indirect item. */ + struct cpu_key * p_s_item_key, /* Key to look for unformatted node pointer to be cut. */ + loff_t n_new_file_size, /* New file size. */ + char * p_c_mode) +{ + struct super_block * p_s_sb = p_s_inode->i_sb; + struct item_head s_ih; + unsigned long n_block_size = p_s_sb->s_blocksize; + char * tail; + int tail_len, round_tail_len; + loff_t pos, pos1; /* position of first byte of the tail */ + struct cpu_key key; + + p_s_sb->u.reiserfs_sb.s_indirect2direct ++; + + *p_c_mode = M_SKIP_BALANCING; + + /* store item head path points to. */ + copy_item_head (&s_ih, PATH_PITEM_HEAD(p_s_path)); + + tail_len = (n_new_file_size & (n_block_size - 1)); + if (!old_format_only (p_s_sb)) + round_tail_len = ROUND_UP (tail_len); + else + round_tail_len = tail_len; + + pos = le_ih_k_offset (&s_ih) - 1 + (le16_to_cpu (s_ih.ih_item_len) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; + pos1 = pos; + + // we are protected by i_sem. 
The tail can not disapper, not + // append can be done either + // we are in truncate or packing tail in file_release + + tail = (char *)kmap(page) ; /* this can schedule */ + + if (path_changed (&s_ih, p_s_path)) { + /* re-search indirect item */ + if ( search_for_position_by_key (p_s_sb, p_s_item_key, p_s_path) == POSITION_NOT_FOUND ) + reiserfs_panic(p_s_sb, "PAP-5520: indirect2direct: " + "item to be converted %k does not exist", p_s_item_key); + copy_item_head(&s_ih, PATH_PITEM_HEAD(p_s_path)); +#ifdef CONFIG_REISERFS_CHECK + pos = le_ih_k_offset (&s_ih) - 1 + + (le16_to_cpu (s_ih.ih_item_len) / UNFM_P_SIZE - 1) * p_s_sb->s_blocksize; + if (pos != pos1) + reiserfs_panic (p_s_sb, "vs-5530: indirect2direct: " + "tail position changed while we were reading it"); +#endif + } + + + /* Set direct item header to insert. */ + make_le_item_head (&s_ih, 0, inode_items_version (p_s_inode), pos1 + 1, + TYPE_DIRECT, round_tail_len, 0xffff/*ih_free_space*/); + + /* we want a pointer to the first byte of the tail in the page. + ** the page was locked and this part of the page was up to date when + ** indirect2direct was called, so we know the bytes are still valid + */ + tail = tail + (pos & (PAGE_CACHE_SIZE - 1)) ; + + PATH_LAST_POSITION(p_s_path)++; + + key = *p_s_item_key; + set_cpu_key_k_type (&key, TYPE_DIRECT); + key.key_length = 4; + /* Insert tail as new direct item in the tree */ + if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, + tail ? tail : NULL) < 0 ) { + /* No disk memory. So we can not convert last unformatted node + to the direct item. In this case we used to adjust + indirect items's ih_free_space. Now ih_free_space is not + used, it would be ideal to write zeros to corresponding + unformatted node. For now i_size is considered as guard for + going out of file size */ + kunmap(page) ; + return n_block_size - round_tail_len; + } + kunmap(page) ; + + /* this will invalidate all the buffers in the page after + ** pos1 + */ + unmap_buffers(page, pos1) ; + + // note: we have now the same as in above direct2indirect + // conversion: there are two keys which have matching first three + // key components. They only differ by the fouhth one. + + /* We have inserted new direct item and must remove last + unformatted node. */ + p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512); + *p_c_mode = M_CUT; + + /* we store position of first direct item in the in-core inode */ + //mark_file_with_tail (p_s_inode, pos1 + 1); + p_s_inode->u.reiserfs_i.i_first_direct_byte = pos1 + 1; + + return n_block_size - round_tail_len; +} + + + diff -u --recursive --new-file v2.4.0/linux/fs/reiserfs/version.c linux/fs/reiserfs/version.c --- v2.4.0/linux/fs/reiserfs/version.c Wed Dec 31 16:00:00 1969 +++ linux/fs/reiserfs/version.c Mon Jan 15 12:42:32 2001 @@ -0,0 +1,7 @@ +/* + * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README + */ + +char *reiserfs_get_version_string(void) { + return "ReiserFS version 3.6.25" ; +} diff -u --recursive --new-file v2.4.0/linux/include/asm-alpha/errno.h linux/include/asm-alpha/errno.h --- v2.4.0/linux/include/asm-alpha/errno.h Wed Apr 16 14:15:00 1997 +++ linux/include/asm-alpha/errno.h Mon Jan 15 12:42:32 2001 @@ -139,4 +139,6 @@ #define ENOMEDIUM 129 /* No medium found */ #define EMEDIUMTYPE 130 /* Wrong medium type */ +#define EHASHCOLLISION 131 /* Number of hash collisons exceeds maximum generation counter value. 
*/ + #endif diff -u --recursive --new-file v2.4.0/linux/include/asm-i386/bugs.h linux/include/asm-i386/bugs.h --- v2.4.0/linux/include/asm-i386/bugs.h Thu Jan 4 14:50:45 2001 +++ linux/include/asm-i386/bugs.h Mon Jan 15 18:20:19 2001 @@ -76,26 +76,23 @@ } /* Enable FXSR and company _before_ testing for FP problems. */ -#if defined(CONFIG_X86_FXSR) || defined(CONFIG_X86_RUNTIME_FXSR) /* * Verify that the FXSAVE/FXRSTOR data will be 16-byte aligned. */ - if (offsetof(struct task_struct, thread.i387.fxsave) & 15) - panic("Kernel compiled for PII/PIII+ with FXSR, data not 16-byte aligned!"); - + if (offsetof(struct task_struct, thread.i387.fxsave) & 15) { + extern void __buggy_fxsr_alignment(void); + __buggy_fxsr_alignment(); + } if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); printk("done.\n"); } -#endif -#ifdef CONFIG_X86_XMM if (cpu_has_xmm) { printk(KERN_INFO "Enabling unmasked SIMD FPU exception support... "); set_in_cr4(X86_CR4_OSXMMEXCPT); printk("done.\n"); } -#endif /* Test for the divl bug.. */ __asm__("fninit\n\t" @@ -202,14 +199,6 @@ && boot_cpu_data.x86_model == 2 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif - -/* - * If we configured ourselves for FXSR, we'd better have it. - */ -#ifdef CONFIG_X86_FXSR - if (!cpu_has_fxsr) - panic("Kernel compiled for PII/PIII+, requires FXSR feature!"); #endif } diff -u --recursive --new-file v2.4.0/linux/include/asm-i386/errno.h linux/include/asm-i386/errno.h --- v2.4.0/linux/include/asm-i386/errno.h Mon Apr 14 16:28:18 1997 +++ linux/include/asm-i386/errno.h Mon Jan 15 12:42:32 2001 @@ -128,5 +128,6 @@ #define ENOMEDIUM 123 /* No medium found */ #define EMEDIUMTYPE 124 /* Wrong medium type */ +#define EHASHCOLLISION 125 /* Number of hash collisons exceeds maximum generation counter value. */ #endif diff -u --recursive --new-file v2.4.0/linux/include/asm-i386/i387.h linux/include/asm-i386/i387.h --- v2.4.0/linux/include/asm-i386/i387.h Thu Jan 4 14:52:01 2001 +++ linux/include/asm-i386/i387.h Mon Jan 15 17:26:26 2001 @@ -23,6 +23,10 @@ extern void save_init_fpu( struct task_struct *tsk ); extern void restore_fpu( struct task_struct *tsk ); +extern void kernel_fpu_begin(void); +#define kernel_fpu_end() stts() + + #define unlazy_fpu( tsk ) do { \ if ( tsk->flags & PF_USEDFPU ) \ save_init_fpu( tsk ); \ @@ -50,10 +54,8 @@ extern void set_fpu_mxcsr( struct task_struct *tsk, unsigned short mxcsr ); #define load_mxcsr( val ) do { \ - if ( cpu_has_xmm ) { \ - unsigned long __mxcsr = ((unsigned long)(val) & 0xffff); \ - asm volatile( "ldmxcsr %0" : : "m" (__mxcsr) ); \ - } \ + unsigned long __mxcsr = ((unsigned long)(val) & 0xffbf); \ + asm volatile( "ldmxcsr %0" : : "m" (__mxcsr) ); \ } while (0) /* diff -u --recursive --new-file v2.4.0/linux/include/asm-i386/pgtable.h linux/include/asm-i386/pgtable.h --- v2.4.0/linux/include/asm-i386/pgtable.h Thu Jan 4 14:50:46 2001 +++ linux/include/asm-i386/pgtable.h Mon Jan 15 17:25:05 2001 @@ -140,7 +140,11 @@ #define VMALLOC_START (((unsigned long) high_memory + 2*VMALLOC_OFFSET-1) & \ ~(VMALLOC_OFFSET-1)) #define VMALLOC_VMADDR(x) ((unsigned long)(x)) -#define VMALLOC_END (FIXADDR_START) +#if CONFIG_HIGHMEM +# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +#else +# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +#endif /* * The 4MB page is guessing.. 
Detailed in the infamous "Chapter H" diff -u --recursive --new-file v2.4.0/linux/include/asm-i386/system.h linux/include/asm-i386/system.h --- v2.4.0/linux/include/asm-i386/system.h Thu Jan 4 14:50:46 2001 +++ linux/include/asm-i386/system.h Mon Jan 15 17:25:04 2001 @@ -267,15 +267,8 @@ * I expect future Intel CPU's to have a weaker ordering, * but I'd also expect them to finally get their act together * and add some real memory barriers if so. - * - * The Pentium III does add a real memory barrier with the - * sfence instruction, so we use that where appropriate. */ -#ifndef CONFIG_X86_XMM #define mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") -#else -#define mb() __asm__ __volatile__ ("sfence": : :"memory") -#endif #define rmb() mb() #define wmb() __asm__ __volatile__ ("": : :"memory") diff -u --recursive --new-file v2.4.0/linux/include/linux/blk.h linux/include/linux/blk.h --- v2.4.0/linux/include/linux/blk.h Thu Jan 4 14:50:47 2001 +++ linux/include/linux/blk.h Mon Jan 15 17:25:39 2001 @@ -87,10 +87,6 @@ static inline void blkdev_dequeue_request(struct request * req) { - if (req->e) { - req->e->dequeue_fn(req); - req->e = NULL; - } list_del(&req->queue); } diff -u --recursive --new-file v2.4.0/linux/include/linux/blkdev.h linux/include/linux/blkdev.h --- v2.4.0/linux/include/linux/blkdev.h Thu Jan 4 14:50:47 2001 +++ linux/include/linux/blkdev.h Mon Jan 15 17:25:28 2001 @@ -23,8 +23,6 @@ int elevator_sequence; struct list_head table; - struct list_head *free_list; - volatile int rq_status; /* should split this into a few status bits */ #define RQ_INACTIVE (-1) #define RQ_ACTIVE 1 @@ -47,7 +45,6 @@ struct buffer_head * bh; struct buffer_head * bhtail; request_queue_t *q; - elevator_t *e; }; #include @@ -67,9 +64,10 @@ typedef void (unplug_device_fn) (void *q); /* - * Default nr free requests per queue + * Default nr free requests per queue, ll_rw_blk will scale it down + * according to available RAM at init time */ -#define QUEUE_NR_REQUESTS 256 +#define QUEUE_NR_REQUESTS 8192 struct request_queue { @@ -77,6 +75,8 @@ * the queue request freelist, one for reads and one for writes */ struct list_head request_freelist[2]; + struct list_head pending_freelist[2]; + int pending_free[2]; /* * Together with queue_head for cacheline sharing @@ -116,7 +116,7 @@ * Is meant to protect the queue in the future instead of * io_request_lock */ - spinlock_t request_lock; + spinlock_t queue_lock; /* * Tasks wait here for free request @@ -152,6 +152,7 @@ extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); extern void generic_make_request(int rw, struct buffer_head * bh); extern request_queue_t *blk_get_queue(kdev_t dev); +extern inline request_queue_t *__blk_get_queue(kdev_t dev); extern void blkdev_release_request(struct request *); /* @@ -162,6 +163,7 @@ extern void blk_queue_headactive(request_queue_t *, int); extern void blk_queue_pluggable(request_queue_t *, plug_device_fn *); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); +extern void generic_unplug_device(void *); extern int * blk_size[MAX_BLKDEV]; @@ -175,9 +177,10 @@ extern int * max_segments[MAX_BLKDEV]; -#define MAX_SECTORS 254 +extern atomic_t queued_sectors; -#define MAX_SEGMENTS MAX_SECTORS +#define MAX_SEGMENTS 128 +#define MAX_SECTORS (MAX_SEGMENTS*8) #define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK) @@ -203,5 +206,14 @@ return 512; } +#define blk_finished_io(nsects) \ + atomic_sub(nsects, 
&queued_sectors); \ + if (atomic_read(&queued_sectors) < 0) { \ + printk("block: queued_sectors < 0\n"); \ + atomic_set(&queued_sectors, 0); \ + } + +#define blk_started_io(nsects) \ + atomic_add(nsects, &queued_sectors); #endif diff -u --recursive --new-file v2.4.0/linux/include/linux/elevator.h linux/include/linux/elevator.h --- v2.4.0/linux/include/linux/elevator.h Tue Jul 18 21:43:10 2000 +++ linux/include/linux/elevator.h Mon Jan 15 13:08:15 2001 @@ -7,34 +7,32 @@ struct list_head *, struct list_head *, int); -typedef int (elevator_merge_fn) (request_queue_t *, struct request **, - struct buffer_head *, int, int *, int *); +typedef int (elevator_merge_fn) (request_queue_t *, struct request **, struct list_head *, + struct buffer_head *, int, int, int); -typedef void (elevator_dequeue_fn) (struct request *); +typedef void (elevator_merge_cleanup_fn) (request_queue_t *, struct request *, int); + +typedef void (elevator_merge_req_fn) (struct request *, struct request *); struct elevator_s { - int sequence; - int read_latency; int write_latency; - int max_bomb_segments; - unsigned int nr_segments; - int read_pendings; - - elevator_fn * elevator_fn; elevator_merge_fn *elevator_merge_fn; - elevator_dequeue_fn *dequeue_fn; + elevator_merge_cleanup_fn *elevator_merge_cleanup_fn; + elevator_merge_req_fn *elevator_merge_req_fn; unsigned int queue_ID; }; -void elevator_noop(struct request *, elevator_t *, struct list_head *, struct list_head *, int); -int elevator_noop_merge(request_queue_t *, struct request **, struct buffer_head *, int, int *, int *); -void elevator_noop_dequeue(struct request *); -void elevator_linus(struct request *, elevator_t *, struct list_head *, struct list_head *, int); -int elevator_linus_merge(request_queue_t *, struct request **, struct buffer_head *, int, int *, int *); +int elevator_noop_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int, int); +void elevator_noop_merge_cleanup(request_queue_t *, struct request *, int); +void elevator_noop_merge_req(struct request *, struct request *); + +int elevator_linus_merge(request_queue_t *, struct request **, struct list_head *, struct buffer_head *, int, int, int); +void elevator_linus_merge_cleanup(request_queue_t *, struct request *, int); +void elevator_linus_merge_req(struct request *, struct request *); typedef struct blkelv_ioctl_arg_s { int queue_ID; @@ -69,6 +67,10 @@ (s1)->sector < (s2)->sector)) || \ (s1)->rq_dev < (s2)->rq_dev) +#define BHRQ_IN_ORDER(bh, rq) \ + (((bh)->b_rdev == (rq)->rq_dev && \ + (bh)->b_rsector < (rq)->sector)) + static inline int elevator_request_latency(elevator_t * elevator, int rw) { int latency; @@ -80,36 +82,24 @@ return latency; } -#define ELEVATOR_NOOP \ -((elevator_t) { \ - 0, /* sequence */ \ - \ - 0, /* read_latency */ \ - 0, /* write_latency */ \ - 0, /* max_bomb_segments */ \ - \ - 0, /* nr_segments */ \ - 0, /* read_pendings */ \ - \ - elevator_noop, /* elevator_fn */ \ - elevator_noop_merge, /* elevator_merge_fn */ \ - elevator_noop_dequeue, /* dequeue_fn */ \ +#define ELEVATOR_NOOP \ +((elevator_t) { \ + 0, /* read_latency */ \ + 0, /* write_latency */ \ + \ + elevator_noop_merge, /* elevator_merge_fn */ \ + elevator_noop_merge_cleanup, /* elevator_merge_cleanup_fn */ \ + elevator_noop_merge_req, /* elevator_merge_req_fn */ \ }) -#define ELEVATOR_LINUS \ -((elevator_t) { \ - 0, /* not used */ \ - \ - 1000000, /* read passovers */ \ - 2000000, /* write passovers */ \ - 0, /* max_bomb_segments */ \ - \ - 0, /* not used */ \ - 0, /* 
not used */ \ - \ - elevator_linus, /* elevator_fn */ \ - elevator_linus_merge, /* elevator_merge_fn */ \ - elevator_noop_dequeue, /* dequeue_fn */ \ +#define ELEVATOR_LINUS \ +((elevator_t) { \ + 8192, /* read passovers */ \ + 16384, /* write passovers */ \ + \ + elevator_linus_merge, /* elevator_merge_fn */ \ + elevator_linus_merge_cleanup, /* elevator_merge_cleanup_fn */ \ + elevator_linus_merge_req, /* elevator_merge_req_fn */ \ }) #endif diff -u --recursive --new-file v2.4.0/linux/include/linux/fs.h linux/include/linux/fs.h --- v2.4.0/linux/include/linux/fs.h Thu Jan 4 14:50:47 2001 +++ linux/include/linux/fs.h Mon Jan 15 17:25:05 2001 @@ -288,6 +288,7 @@ #include #include #include +#include #include #include #include @@ -450,6 +451,7 @@ struct hfs_inode_info hfs_i; struct adfs_inode_info adfs_i; struct qnx4_inode_info qnx4_i; + struct reiserfs_inode_info reiserfs_i; struct bfs_inode_info bfs_i; struct udf_inode_info udf_i; struct ncp_inode_info ncpfs_i; @@ -460,35 +462,6 @@ } u; }; -/* Inode state bits.. */ -#define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ -#define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ -#define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ -#define I_LOCK 8 -#define I_FREEING 16 -#define I_CLEAR 32 - -#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) - -extern void __mark_inode_dirty(struct inode *, int); -static inline void mark_inode_dirty(struct inode *inode) -{ - if ((inode->i_state & I_DIRTY) != I_DIRTY) - __mark_inode_dirty(inode, I_DIRTY); -} - -static inline void mark_inode_dirty_sync(struct inode *inode) -{ - if (!(inode->i_state & I_DIRTY_SYNC)) - __mark_inode_dirty(inode, I_DIRTY_SYNC); -} - -static inline void mark_inode_dirty_pages(struct inode *inode) -{ - if (inode && !(inode->i_state & I_DIRTY_PAGES)) - __mark_inode_dirty(inode, I_DIRTY_PAGES); -} - struct fown_struct { int pid; /* pid or -pgrp where SIGIO should be sent */ uid_t uid, euid; /* uid/euid of process setting the owner */ @@ -654,6 +627,7 @@ #include #include #include +#include #include #include #include @@ -702,6 +676,7 @@ struct hfs_sb_info hfs_sb; struct adfs_sb_info adfs_sb; struct qnx4_sb_info qnx4_sb; + struct reiserfs_sb_info reiserfs_sb; struct bfs_sb_info bfs_sb; struct udf_sb_info udf_sb; struct ncp_sb_info ncpfs_sb; @@ -815,17 +790,54 @@ */ struct super_operations { void (*read_inode) (struct inode *); + + /* reiserfs kludge. reiserfs needs 64 bits of information to + ** find an inode. We are using the read_inode2 call to get + ** that information. We don't like this, and are waiting on some + ** VFS changes for the real solution. + ** iget4 calls read_inode2, iff it is defined + */ + void (*read_inode2) (struct inode *, void *) ; + void (*dirty_inode) (struct inode *); void (*write_inode) (struct inode *, int); void (*put_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); + void (*write_super_lockfs) (struct super_block *); + void (*unlockfs) (struct super_block *); int (*statfs) (struct super_block *, struct statfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); void (*umount_begin) (struct super_block *); }; +/* Inode state bits.. 
*/ +#define I_DIRTY_SYNC 1 /* Not dirty enough for O_DATASYNC */ +#define I_DIRTY_DATASYNC 2 /* Data-related inode changes pending */ +#define I_DIRTY_PAGES 4 /* Data-related inode changes pending */ +#define I_LOCK 8 +#define I_FREEING 16 +#define I_CLEAR 32 + +#define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES) + +extern void __mark_inode_dirty(struct inode *, int); +static inline void mark_inode_dirty(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY); +} + +static inline void mark_inode_dirty_sync(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY_SYNC); +} + +static inline void mark_inode_dirty_pages(struct inode *inode) +{ + __mark_inode_dirty(inode, I_DIRTY_PAGES); +} + struct dquot_operations { void (*initialize) (struct inode *, short); void (*drop) (struct inode *); @@ -987,6 +999,9 @@ extern int try_to_free_buffers(struct page *, int); extern void refile_buffer(struct buffer_head * buf); + +/* reiserfs_writepage needs this */ +extern void set_buffer_async_io(struct buffer_head *bh) ; #define BUF_CLEAN 0 #define BUF_LOCKED 1 /* Buffers scheduled for write */ diff -u --recursive --new-file v2.4.0/linux/include/linux/mm.h linux/include/linux/mm.h --- v2.4.0/linux/include/linux/mm.h Thu Jan 4 14:50:47 2001 +++ linux/include/linux/mm.h Mon Jan 15 17:25:05 2001 @@ -464,6 +464,7 @@ #else #define __GFP_HIGHMEM 0x0 /* noop */ #endif +#define __GFP_VM 0x20 #define GFP_BUFFER (__GFP_HIGH | __GFP_WAIT) diff -u --recursive --new-file v2.4.0/linux/include/linux/reiserfs_fs.h linux/include/linux/reiserfs_fs.h --- v2.4.0/linux/include/linux/reiserfs_fs.h Wed Dec 31 16:00:00 1969 +++ linux/include/linux/reiserfs_fs.h Mon Jan 15 13:23:01 2001 @@ -0,0 +1,2074 @@ +/* + * Copyright 1996, 1997, 1998 Hans Reiser, see reiserfs/README for licensing and copyright details + */ + + /* this file has an amazingly stupid + name, yura please fix it to be + reiserfs.h, and merge all the rest + of our .h files that are in this + directory into it. */ + + +#ifndef _LINUX_REISER_FS_H +#define _LINUX_REISER_FS_H + + +#include +#ifdef __KERNEL__ +#include +#include +#endif + +/* + * include/linux/reiser_fs.h + * + * Reiser File System constants and structures + * + */ + +/* in reading the #defines, it may help to understand that they employ + the following abbreviations: + + B = Buffer + I = Item header + H = Height within the tree (should be changed to LEV) + N = Number of the item in the node + STAT = stat data + DEH = Directory Entry Header + EC = Entry Count + E = Entry number + UL = Unsigned Long + BLKH = BLocK Header + UNFM = UNForMatted node + DC = Disk Child + P = Path + + These #defines are named by concatenating these abbreviations, + where first comes the arguments, and last comes the return value, + of the macro. + +*/ + + /* Vladimir, what is the story with + new_get_new_buffer nowadays? I + want a complete explanation written + here. */ + +/* NEW_GET_NEW_BUFFER will try to allocate new blocks better */ +/*#define NEW_GET_NEW_BUFFER*/ +#define OLD_GET_NEW_BUFFER + + /* Vladimir, what about this one too? 
*/ +/* if this is undefined, all inode changes get into stat data immediately, if it can be found in RAM */ +#define DIRTY_LATER + +/* enable journalling */ +#define ENABLE_JOURNAL + +#ifdef __KERNEL__ + +/* #define REISERFS_CHECK */ + +#define REISERFS_PREALLOCATE +#endif +#define PREALLOCATION_SIZE 8 + +/* if this is undefined, all inode changes get into stat data + immediately, if it can be found in RAM */ +#define DIRTY_LATER + + +/*#define READ_LOCK_REISERFS*/ + + +/* n must be power of 2 */ +#define _ROUND_UP(x,n) (((x)+(n)-1u) & ~((n)-1u)) + +// to be ok for alpha and others we have to align structures to 8 byte +// boundary. +// FIXME: do not change 4 by anything else: there is code which relies on that + /* what 4? -Hans */ +#define ROUND_UP(x) _ROUND_UP(x,8LL) + +/* debug levels. Right now, CONFIG_REISERFS_CHECK means print all debug +** messages. +*/ +#define REISERFS_DEBUG_CODE 5 /* extra messages to help find/debug errors */ + +/* + * Disk Data Structures + */ + +/***************************************************************************/ +/* SUPER BLOCK */ +/***************************************************************************/ + +/* + * Structure of super block on disk, a version of which in RAM is often accessed as s->u.reiserfs_sb.s_rs + * the version in RAM is part of a larger structure containing fields never written to disk. + */ + + /* used by gcc */ +#define REISERFS_SUPER_MAGIC 0x52654973 + /* used by file system utilities that + look at the superblock, etc. */ +#define REISERFS_SUPER_MAGIC_STRING "ReIsErFs" +#define REISER2FS_SUPER_MAGIC_STRING "ReIsEr2Fs" + +extern inline int is_reiserfs_magic_string (struct reiserfs_super_block * rs) +{ + return (!strncmp (rs->s_magic, REISERFS_SUPER_MAGIC_STRING, + strlen ( REISERFS_SUPER_MAGIC_STRING)) || + !strncmp (rs->s_magic, REISER2FS_SUPER_MAGIC_STRING, + strlen ( REISER2FS_SUPER_MAGIC_STRING))); +} + + /* ReiserFS leaves the first 64k unused, + so that partition labels have enough + space. If someone wants to write a + fancy bootloader that needs more than + 64k, let us know, and this will be + increased in size. This number must + be larger than than the largest block + size on any platform, or code will + break. -Hans */ +#define REISERFS_DISK_OFFSET_IN_BYTES (64 * 1024) +#define REISERFS_FIRST_BLOCK unused_define + +/* the spot for the super in versions 3.5 - 3.5.10 (inclusive) */ +#define REISERFS_OLD_DISK_OFFSET_IN_BYTES (8 * 1024) + + +// reiserfs internal error code (used by search_by_key adn fix_nodes)) +#define CARRY_ON 0 +#define REPEAT_SEARCH -1 +#define IO_ERROR -2 +#define NO_DISK_SPACE -3 +#define NO_BALANCING_NEEDED (-4) +#define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5) + +//#define SCHEDULE_OCCURRED 1 +//#define PATH_INCORRECT 2 + +//#define NO_DISK_SPACE (-1) + + + +typedef unsigned long b_blocknr_t; +typedef __u32 unp_t; + + /* who is responsible for this + completely uncommented struct? */ +struct unfm_nodeinfo { + /* This is what? */ + unp_t unfm_nodenum; + /* now this I know what it is, and + most of the people on our project + know what it is, but I bet nobody + new I hire will have a clue. */ + unsigned short unfm_freespace; +}; + + +/* when reiserfs_file_write is called with a byte count >= MIN_PACK_ON_CLOSE, +** it sets the inode to pack on close, and when extending the file, will only +** use unformatted nodes. +** +** This is a big speed up for the journal, which is badly hurt by direct->indirect +** conversions (they must be logged). 
+*/ +#define MIN_PACK_ON_CLOSE 512 + +/* the defines below say, that if file size is >= + DIRECT_TAIL_SUPPRESSION_SIZE * blocksize, then if tail is longer + than MAX_BYTES_SUPPRESS_DIRECT_TAIL, it will be stored in + unformatted node */ +#define DIRECT_TAIL_SUPPRESSION_SIZE 1024 +#define MAX_BYTES_SUPPRESS_DIRECT_TAIL 1024 + +#if 0 + +// +#define mark_file_with_tail(inode,offset) \ +{\ +inode->u.reiserfs_i.i_has_tail = 1;\ +} + +#define mark_file_without_tail(inode) \ +{\ +inode->u.reiserfs_i.i_has_tail = 0;\ +} + +#endif + +// this says about version of all items (but stat data) the object +// consists of +#define inode_items_version(inode) ((inode)->u.reiserfs_i.i_version) + + +/* We store tail in unformatted node if it is too big to fit into a + formatted node or if DIRECT_TAIL_SUPPRESSION_SIZE, + MAX_BYTES_SUPPRESS_DIRECT_TAIL and file size say that. */ +/* #define STORE_TAIL_IN_UNFM(n_file_size,n_tail_size,n_block_size) \ */ +/* ( ((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ */ +/* ( ( (n_file_size) >= (n_block_size) * DIRECT_TAIL_SUPPRESSION_SIZE ) && \ */ +/* ( (n_tail_size) >= MAX_BYTES_SUPPRESS_DIRECT_TAIL ) ) ) */ + + /* This is an aggressive tail suppression policy, I am hoping it + improves our benchmarks. The principle behind it is that + percentage space saving is what matters, not absolute space + saving. This is non-intuitive, but it helps to understand it if + you consider that the cost to access 4 blocks is not much more + than the cost to access 1 block, if you have to do a seek and + rotate. A tail risks a non-linear disk access that is + significant as a percentage of total time cost for a 4 block file + and saves an amount of space that is less significant as a + percentage of space, or so goes the hypothesis. -Hans */ +#define STORE_TAIL_IN_UNFM(n_file_size,n_tail_size,n_block_size) \ +(\ + (!(n_tail_size)) || \ + (((n_tail_size) > MAX_DIRECT_ITEM_LEN(n_block_size)) || \ + ( (n_file_size) >= (n_block_size) * 4 ) || \ + ( ( (n_file_size) >= (n_block_size) * 3 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/4) ) || \ + ( ( (n_file_size) >= (n_block_size) * 2 ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size))/2) ) || \ + ( ( (n_file_size) >= (n_block_size) ) && \ + ( (n_tail_size) >= (MAX_DIRECT_ITEM_LEN(n_block_size) * 3)/4) ) ) \ +) + + +/* + * values for s_state field + */ +#define REISERFS_VALID_FS 1 +#define REISERFS_ERROR_FS 2 + + + +/***************************************************************************/ +/* KEY & ITEM HEAD */ +/***************************************************************************/ + +// +// we do support for old format of reiserfs: the problem is to +// distinuquish keys with 32 bit offset and keys with 60 bit ones. On +// leaf level we use ih_version of struct item_head (was +// ih_reserved). For all old items it is set to 0 +// (ITEM_VERSION_1). For new items it is ITEM_VERSION_2. On internal +// levels we have to know version of item key belongs to. 
+// +#define ITEM_VERSION_1 0 +#define ITEM_VERSION_2 1 + + +/* loff_t - long long */ + + +// +// directories use this key as well as old files +// +struct offset_v1 { + __u32 k_offset; + __u32 k_uniqueness; +} __attribute__ ((__packed__)); + +struct offset_v2 { + __u64 k_offset:60; + __u64 k_type: 4; +} __attribute__ ((__packed__)); + + + +/* Key of an item determines its location in the S+tree, and + is composed of 4 components */ +struct key { + __u32 k_dir_id; /* packing locality: by default parent + directory object id */ + __u32 k_objectid; /* object identifier */ + union { + struct offset_v1 k_offset_v1; + struct offset_v2 k_offset_v2; + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); + + +struct cpu_key { + struct key on_disk_key; + int version; + int key_length; /* 3 in all cases but direct2indirect and + indirect2direct conversion */ +}; + + + + + + + + /* Our function for comparing keys can compare keys of different + lengths. It takes as a parameter the length of the keys it is to + compare. These defines are used in determining what is to be + passed to it as that parameter. */ +#define REISERFS_FULL_KEY_LEN 4 + +#define REISERFS_SHORT_KEY_LEN 2 + +/* The result of the key compare */ +#define FIRST_GREATER 1 +#define SECOND_GREATER -1 +#define KEYS_IDENTICAL 0 +#define KEY_FOUND 1 +#define KEY_NOT_FOUND 0 + + +#define KEY_SIZE (sizeof(struct key)) +#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) + +/* return values for search_by_key and clones */ +#define ITEM_FOUND 1 +#define ITEM_NOT_FOUND 0 +#define ENTRY_FOUND 1 +#define ENTRY_NOT_FOUND 0 +#define DIRECTORY_NOT_FOUND -1 +#define REGULAR_FILE_FOUND -2 +#define DIRECTORY_FOUND -3 +#define BYTE_FOUND 1 +#define BYTE_NOT_FOUND 0 +#define FILE_NOT_FOUND -1 + +#define POSITION_FOUND 1 +#define POSITION_NOT_FOUND 0 + +// return values for reiserfs_find_entry and search_by_entry_key +#define NAME_FOUND 1 +#define NAME_NOT_FOUND 0 +#define GOTO_PREVIOUS_ITEM 2 +#define NAME_FOUND_INVISIBLE 3 + + + +/* Everything in the filesystem is stored as a set of items. The + item head contains the key of the item, its free space (for + indirect items) and specifies the location of the item itself + within the block. */ + +struct item_head +{ + struct key ih_key; /* Everything in the tree is found by searching for it based on its key.*/ + + /* This is bloat, this should be part + of the item not the item + header. -Hans */ + union { + __u16 ih_free_space_reserved; /* The free space in the last unformatted node of an indirect item if this + is an indirect item. This equals 0xFFFF iff this is a direct item or + stat data item. Note that the key, not this field, is used to determine + the item type, and thus which field this union contains. */ + __u16 ih_entry_count; /* Iff this is a directory item, this field equals the number of directory + entries in the directory item. */ + } __attribute__ ((__packed__)) u; + __u16 ih_item_len; /* total size of the item body */ + __u16 ih_item_location; /* an offset to the item body within the block */ + /* I thought we were going to use this + for having lots of item types? Why + don't you use this for item type + not item version. That is how you + talked me into this field a year + ago, remember? I am still not + convinced it needs to be 16 bits + (for at least many years), but at + least I can sympathize with that + hope. Change the name from version + to type, and tell people not to use + FFFF in case 16 bits is someday too + small and needs to be extended:-). 
*/ + __u16 ih_version; /* 0 for all old items, 2 for new + ones. Highest bit is set by fsck + temporary, cleaned after all done */ +} __attribute__ ((__packed__)); +/* size of item header */ +#define IH_SIZE (sizeof(struct item_head)) + +#define ih_free_space(ih) le16_to_cpu((ih)->u.ih_free_space_reserved) +#define ih_version(ih) le16_to_cpu((ih)->ih_version) +#define ih_entry_count(ih) le16_to_cpu((ih)->u.ih_entry_count) +#define ih_location(ih) le16_to_cpu((ih)->ih_item_location) +#define ih_item_len(ih) le16_to_cpu((ih)->ih_item_len) + +#define put_ih_free_space(ih, val) do { (ih)->u.ih_free_space_reserved = cpu_to_le16(val); } while(0) +#define put_ih_version(ih, val) do { (ih)->ih_version = cpu_to_le16(val); } while (0) +#define put_ih_entry_count(ih, val) do { (ih)->u.ih_entry_count = cpu_to_le16(val); } while (0) +#define put_ih_location(ih, val) do { (ih)->ih_item_location = cpu_to_le16(val); } while (0) +#define put_ih_item_len(ih, val) do { (ih)->ih_item_len = cpu_to_le16(val); } while (0) + + +// FIXME: now would that work for other than i386 archs +#define unreachable_item(ih) (ih->ih_version & (1 << 15)) + +#define get_ih_free_space(ih) (ih_version (ih) == ITEM_VERSION_2 ? 0 : ih_free_space (ih)) +#define set_ih_free_space(ih,val) put_ih_free_space((ih), ((ih_version(ih) == ITEM_VERSION_2) ? 0 : (val))) + + +// +// there are 5 item types currently +// +#define TYPE_STAT_DATA 0 +#define TYPE_INDIRECT 1 +#define TYPE_DIRECT 2 +#define TYPE_DIRENTRY 3 +#define TYPE_ANY 15 // FIXME: comment is required + +// +// in old version uniqueness field shows key type +// +#define V1_SD_UNIQUENESS 0 +#define V1_INDIRECT_UNIQUENESS 0xfffffffe +#define V1_DIRECT_UNIQUENESS 0xffffffff +#define V1_DIRENTRY_UNIQUENESS 500 +#define V1_ANY_UNIQUENESS 555 // FIXME: comment is required + +// +// here are conversion routines +// +extern inline int uniqueness2type (__u32 uniqueness) +{ + switch (uniqueness) { + case V1_SD_UNIQUENESS: return TYPE_STAT_DATA; + case V1_INDIRECT_UNIQUENESS: return TYPE_INDIRECT; + case V1_DIRECT_UNIQUENESS: return TYPE_DIRECT; + case V1_DIRENTRY_UNIQUENESS: return TYPE_DIRENTRY; + } +/* + if (uniqueness != V1_ANY_UNIQUENESS) { + printk ("uniqueness %d\n", uniqueness); + BUG (); + } +*/ + return TYPE_ANY; +} + +extern inline __u32 type2uniqueness (int type) +{ + switch (type) { + case TYPE_STAT_DATA: return V1_SD_UNIQUENESS; + case TYPE_INDIRECT: return V1_INDIRECT_UNIQUENESS; + case TYPE_DIRECT: return V1_DIRECT_UNIQUENESS; + case TYPE_DIRENTRY: return V1_DIRENTRY_UNIQUENESS; + } + /* + if (type != TYPE_ANY) + BUG (); + */ + return V1_ANY_UNIQUENESS; +} + + +// +// key is pointer to on disk key which is stored in le, result is cpu, +// there is no way to get version of object from key, so, provide +// version to these defines +// +extern inline loff_t le_key_k_offset (int version, struct key * key) +{ + return (version == ITEM_VERSION_1) ? key->u.k_offset_v1.k_offset : + le64_to_cpu (key->u.k_offset_v2.k_offset); +} +extern inline loff_t le_ih_k_offset (struct item_head * ih) +{ + return le_key_k_offset (ih_version (ih), &(ih->ih_key)); +} + + +extern inline loff_t le_key_k_type (int version, struct key * key) +{ + return (version == ITEM_VERSION_1) ? 
uniqueness2type (key->u.k_offset_v1.k_uniqueness) : + le16_to_cpu (key->u.k_offset_v2.k_type); +} +extern inline loff_t le_ih_k_type (struct item_head * ih) +{ + return le_key_k_type (ih_version (ih), &(ih->ih_key)); +} + + +extern inline void set_le_key_k_offset (int version, struct key * key, loff_t offset) +{ + (version == ITEM_VERSION_1) ? (key->u.k_offset_v1.k_offset = offset) : + (key->u.k_offset_v2.k_offset = cpu_to_le64 (offset)); +} +extern inline void set_le_ih_k_offset (struct item_head * ih, loff_t offset) +{ + set_le_key_k_offset (ih_version (ih), &(ih->ih_key), offset); +} + + + +extern inline void set_le_key_k_type (int version, struct key * key, int type) +{ + (version == ITEM_VERSION_1) ? (key->u.k_offset_v1.k_uniqueness = type2uniqueness (type)) : + (key->u.k_offset_v2.k_type = cpu_to_le16 (type)); +} +extern inline void set_le_ih_k_type (struct item_head * ih, int type) +{ + set_le_key_k_type (ih_version (ih), &(ih->ih_key), type); +} + + +#define is_direntry_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRENTRY) +#define is_direct_le_key(version,key) (le_key_k_type (version, key) == TYPE_DIRECT) +#define is_indirect_le_key(version,key) (le_key_k_type (version, key) == TYPE_INDIRECT) +#define is_statdata_le_key(version,key) (le_key_k_type (version, key) == TYPE_STAT_DATA) + +// +// item header has version. +// +#define is_direntry_le_ih(ih) is_direntry_le_key (ih_version (ih), &((ih)->ih_key)) +#define is_direct_le_ih(ih) is_direct_le_key (ih_version (ih), &((ih)->ih_key)) +#define is_indirect_le_ih(ih) is_indirect_le_key (ih_version(ih), &((ih)->ih_key)) +#define is_statdata_le_ih(ih) is_statdata_le_key (ih_version (ih), &((ih)->ih_key)) + + + +// +// key is pointer to cpu key, result is cpu +// +extern inline loff_t cpu_key_k_offset (struct cpu_key * key) +{ + return (key->version == ITEM_VERSION_1) ? key->on_disk_key.u.k_offset_v1.k_offset : + key->on_disk_key.u.k_offset_v2.k_offset; +} + +extern inline loff_t cpu_key_k_type (struct cpu_key * key) +{ + return (key->version == ITEM_VERSION_1) ? uniqueness2type (key->on_disk_key.u.k_offset_v1.k_uniqueness) : + key->on_disk_key.u.k_offset_v2.k_type; +} + +extern inline void set_cpu_key_k_offset (struct cpu_key * key, loff_t offset) +{ + (key->version == ITEM_VERSION_1) ? (key->on_disk_key.u.k_offset_v1.k_offset = offset) : + (key->on_disk_key.u.k_offset_v2.k_offset = offset); +} + + +extern inline void set_cpu_key_k_type (struct cpu_key * key, int type) +{ + (key->version == ITEM_VERSION_1) ? (key->on_disk_key.u.k_offset_v1.k_uniqueness = type2uniqueness (type)) : + (key->on_disk_key.u.k_offset_v2.k_type = type); +} + +extern inline void cpu_key_k_offset_dec (struct cpu_key * key) +{ + if (key->version == ITEM_VERSION_1) + key->on_disk_key.u.k_offset_v1.k_offset --; + else + key->on_disk_key.u.k_offset_v2.k_offset --; +} + + +#define is_direntry_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRENTRY) +#define is_direct_cpu_key(key) (cpu_key_k_type (key) == TYPE_DIRECT) +#define is_indirect_cpu_key(key) (cpu_key_k_type (key) == TYPE_INDIRECT) +#define is_statdata_cpu_key(key) (cpu_key_k_type (key) == TYPE_STAT_DATA) + + +/* are these used ? */ +#define is_direntry_cpu_ih(ih) (is_direntry_cpu_key (&((ih)->ih_key))) +#define is_direct_cpu_ih(ih) (is_direct_cpu_key (&((ih)->ih_key))) +#define is_indirect_cpu_ih(ih) (is_indirect_cpu_key (&((ih)->ih_key))) +#define is_statdata_cpu_ih(ih) (is_statdata_cpu_key (&((ih)->ih_key))) + + + + + +#define I_K_KEY_IN_ITEM(p_s_ih, p_s_key, n_blocksize) \ + ( ! 
COMP_SHORT_KEYS(p_s_ih, p_s_key) && \ + I_OFF_BYTE_IN_ITEM(p_s_ih, k_offset (p_s_key), n_blocksize) ) + +/* maximal length of item */ +#define MAX_ITEM_LEN(block_size) (block_size - BLKH_SIZE - IH_SIZE) +#define MIN_ITEM_LEN 1 + + +/* object identifier for root dir */ +#define REISERFS_ROOT_OBJECTID 2 +#define REISERFS_ROOT_PARENT_OBJECTID 1 +extern struct key root_key; + + + + +/* + * Picture represents a leaf of the S+tree + * ______________________________________________________ + * | | Array of | | | + * |Block | Object-Item | F r e e | Objects- | + * | head | Headers | S p a c e | Items | + * |______|_______________|___________________|___________| + */ + +/* Header of a disk block. More precisely, header of a formatted leaf + or internal node, and not the header of an unformatted node. */ +struct block_head { + __u16 blk_level; /* Level of a block in the tree. */ + __u16 blk_nr_item; /* Number of keys/items in a block. */ + __u16 blk_free_space; /* Block free space in bytes. */ + __u16 blk_reserved; + /* dump this in v4/planA */ + struct key blk_right_delim_key; /* kept only for compatibility */ +}; + +#define BLKH_SIZE (sizeof(struct block_head)) + +/* + * values for blk_level field of the struct block_head + */ + +#define FREE_LEVEL 0 /* when node gets removed from the tree its + blk_level is set to FREE_LEVEL. It is then + used to see whether the node is still in the + tree */ + +#define DISK_LEAF_NODE_LEVEL 1 /* Leaf node level.*/ + +/* Given the buffer head of a formatted node, resolve to the block head of that node. */ +#define B_BLK_HEAD(p_s_bh) ((struct block_head *)((p_s_bh)->b_data)) +/* Number of items that are in buffer. */ +#define B_NR_ITEMS(p_s_bh) (le16_to_cpu ( B_BLK_HEAD(p_s_bh)->blk_nr_item )) +#define B_LEVEL(bh) (le16_to_cpu ( B_BLK_HEAD(bh)->blk_level )) +#define B_FREE_SPACE(bh) (le16_to_cpu ( B_BLK_HEAD(bh)->blk_free_space )) + +#define PUT_B_NR_ITEMS(p_s_bh) do { B_BLK_HEAD(p_s_bh)->blk_nr_item = cpu_to_le16(val); } while (0) +#define PUT_B_LEVEL(bh, val) do { B_BLK_HEAD(bh)->blk_level = cpu_to_le16(val); } while (0) +#define PUT_B_FREE_SPACE(bh) do { B_BLK_HEAD(bh)->blk_free_space = cpu_to_le16(val); } while (0) + +/* Get right delimiting key. */ +#define B_PRIGHT_DELIM_KEY(p_s_bh) ( &(B_BLK_HEAD(p_s_bh)->blk_right_delim_key) ) + +/* Does the buffer contain a disk leaf. */ +#define B_IS_ITEMS_LEVEL(p_s_bh) ( B_BLK_HEAD(p_s_bh)->blk_level == DISK_LEAF_NODE_LEVEL ) + +/* Does the buffer contain a disk internal node */ +#define B_IS_KEYS_LEVEL(p_s_bh) ( B_BLK_HEAD(p_s_bh)->blk_level > DISK_LEAF_NODE_LEVEL &&\ + B_BLK_HEAD(p_s_bh)->blk_level <= MAX_HEIGHT ) + + + + +/***************************************************************************/ +/* STAT DATA */ +/***************************************************************************/ + + +// +// old stat data is 32 bytes long. 
We are going to distinguish new one by +// different size +// +struct stat_data_v1 +{ + __u16 sd_mode; /* file type, permissions */ + __u16 sd_nlink; /* number of hard links */ + __u16 sd_uid; /* owner */ + __u16 sd_gid; /* group */ + __u32 sd_size; /* file size */ + __u32 sd_atime; /* time of last access */ + __u32 sd_mtime; /* time file was last modified */ + __u32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + union { + __u32 sd_rdev; + __u32 sd_blocks; /* number of blocks file uses */ + } __attribute__ ((__packed__)) u; + __u32 sd_first_direct_byte; /* first byte of file which is stored + in a direct item: except that if it + equals 1 it is a symlink and if it + equals ~(__u32)0 there is no + direct item. The existence of this + field really grates on me. Let's + replace it with a macro based on + sd_size and our tail suppression + policy. Someday. -Hans */ +} __attribute__ ((__packed__)); + +#define SD_V1_SIZE (sizeof(struct stat_data_v1)) + + +/* Stat Data on disk (reiserfs version of UFS disk inode minus the + address blocks) */ +struct stat_data { + __u16 sd_mode; /* file type, permissions */ + __u16 sd_reserved; + __u32 sd_nlink; /* number of hard links */ + __u64 sd_size; /* file size */ + __u32 sd_uid; /* owner */ + __u32 sd_gid; /* group */ + __u32 sd_atime; /* time of last access */ + __u32 sd_mtime; /* time file was last modified */ + __u32 sd_ctime; /* time inode (stat data) was last changed (except changes to sd_atime and sd_mtime) */ + __u32 sd_blocks; + union { + __u32 sd_rdev; + //__u32 sd_first_direct_byte; + /* first byte of file which is stored in a + direct item: except that if it equals 1 + it is a symlink and if it equals + ~(__u32)0 there is no direct item. The + existence of this field really grates + on me. Let's replace it with a macro + based on sd_size and our tail + suppression policy? */ + } __attribute__ ((__packed__)) u; +} __attribute__ ((__packed__)); +// +// this is 40 bytes long +// +#define SD_SIZE (sizeof(struct stat_data)) + +#define stat_data_v1(ih) (ih_version (ih) == ITEM_VERSION_1) + + +/***************************************************************************/ +/* DIRECTORY STRUCTURE */ +/***************************************************************************/ +/* + Picture represents the structure of directory items + ________________________________________________ + | Array of | | | | | | + | directory |N-1| N-2 | .... | 1st |0th| + | entry headers | | | | | | + |_______________|___|_____|________|_______|___| + <---- directory entries ------> + + First directory item has k_offset component 1. We store "." and ".." + in one item, always, we never split "." and ".." into differing + items. This makes, among other things, the code for removing + directories simpler. */ +#define SD_OFFSET 0 +#define SD_UNIQUENESS 0 +#define DOT_OFFSET 1 +#define DOT_DOT_OFFSET 2 +#define DIRENTRY_UNIQUENESS 500 + +/* */ +#define FIRST_ITEM_OFFSET 1 + +/* + Q: How to get key of object pointed to by entry from entry? + + A: Each directory entry has its header. 
This header has deh_dir_id and deh_objectid fields, those are key + of object, entry points to */ + +/* NOT IMPLEMENTED: + Directory will someday contain stat data of object */ + + + +struct reiserfs_de_head +{ + __u32 deh_offset; /* third component of the directory entry key */ + __u32 deh_dir_id; /* objectid of the parent directory of the object, that is referenced + by directory entry */ + __u32 deh_objectid; /* objectid of the object, that is referenced by directory entry */ + __u16 deh_location; /* offset of name in the whole item */ + __u16 deh_state; /* whether 1) entry contains stat data (for future), and 2) whether + entry is hidden (unlinked) */ +} __attribute__ ((__packed__)); +#define DEH_SIZE sizeof(struct reiserfs_de_head) + +/* empty directory contains two entries "." and ".." and their headers */ +#define EMPTY_DIR_SIZE \ +(DEH_SIZE * 2 + ROUND_UP (strlen (".")) + ROUND_UP (strlen (".."))) + +/* old format directories have this size when empty */ +#define EMPTY_DIR_SIZE_V1 (DEH_SIZE * 2 + 3) + +#define DEH_Statdata 0 /* not used now */ +#define DEH_Visible 2 + +/* bitops which deals with unaligned addrs; + needed for alpha port. --zam */ +#ifdef __alpha__ +# define ADDR_UNALIGNED_BITS (5) +#endif + +#ifdef ADDR_UNALIGNED_BITS + +# define aligned_address(addr) ((void *)((long)(addr) & ~((1UL << ADDR_UNALIGNED_BITS) - 1))) +# define unaligned_offset(addr) (((int)((long)(addr) & ((1 << ADDR_UNALIGNED_BITS) - 1))) << 3) + +# define set_bit_unaligned(nr, addr) set_bit((nr) + unaligned_offset(addr), aligned_address(addr)) +# define clear_bit_unaligned(nr, addr) clear_bit((nr) + unaligned_offset(addr), aligned_address(addr)) +# define test_bit_unaligned(nr, addr) test_bit((nr) + unaligned_offset(addr), aligned_address(addr)) + +#else + +# define set_bit_unaligned(nr, addr) set_bit(nr, addr) +# define clear_bit_unaligned(nr, addr) clear_bit(nr, addr) +# define test_bit_unaligned(nr, addr) test_bit(nr, addr) + +#endif + +#define deh_dir_id(deh) (__le32_to_cpu ((deh)->deh_dir_id)) +#define deh_objectid(deh) (__le32_to_cpu ((deh)->deh_objectid)) +#define deh_offset(deh) (__le32_to_cpu ((deh)->deh_offset)) + + +#define mark_de_with_sd(deh) set_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_without_sd(deh) clear_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define mark_de_visible(deh) set_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define mark_de_hidden(deh) clear_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +#define de_with_sd(deh) test_bit_unaligned (DEH_Statdata, &((deh)->deh_state)) +#define de_visible(deh) test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) +#define de_hidden(deh) !test_bit_unaligned (DEH_Visible, &((deh)->deh_state)) + +/* compose directory item containing "." and ".." entries (entries are + not aligned to 4 byte boundary) */ +extern inline void make_empty_dir_item_v1 (char * body, __u32 dirid, __u32 objid, + __u32 par_dirid, __u32 par_objid) +{ + struct reiserfs_de_head * deh; + + memset (body, 0, EMPTY_DIR_SIZE_V1); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + deh[0].deh_offset = cpu_to_le32 (DOT_OFFSET); + deh[0].deh_dir_id = cpu_to_le32 (dirid); + deh[0].deh_objectid = cpu_to_le32 (objid); + deh[0].deh_location = cpu_to_le16 (EMPTY_DIR_SIZE_V1 - strlen (".")); + deh[0].deh_state = 0; + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + deh[1].deh_offset = cpu_to_le32 (DOT_DOT_OFFSET); + /* key of ".." 
for the root directory */ + deh[1].deh_dir_id = cpu_to_le32 (par_dirid); + deh[1].deh_objectid = cpu_to_le32 (par_objid); + deh[1].deh_location = cpu_to_le16 (le16_to_cpu (deh[0].deh_location) - strlen ("..")); + deh[1].deh_state = 0; + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy (body + deh[0].deh_location, ".", 1); + memcpy (body + deh[1].deh_location, "..", 2); +} + +/* compose directory item containing "." and ".." entries */ +extern inline void make_empty_dir_item (char * body, __u32 dirid, __u32 objid, + __u32 par_dirid, __u32 par_objid) +{ + struct reiserfs_de_head * deh; + + memset (body, 0, EMPTY_DIR_SIZE); + deh = (struct reiserfs_de_head *)body; + + /* direntry header of "." */ + deh[0].deh_offset = cpu_to_le32 (DOT_OFFSET); + deh[0].deh_dir_id = cpu_to_le32 (dirid); + deh[0].deh_objectid = cpu_to_le32 (objid); + deh[0].deh_location = cpu_to_le16 (EMPTY_DIR_SIZE - ROUND_UP (strlen ("."))); + deh[0].deh_state = 0; + mark_de_visible(&(deh[0])); + + /* direntry header of ".." */ + deh[1].deh_offset = cpu_to_le32 (DOT_DOT_OFFSET); + /* key of ".." for the root directory */ + deh[1].deh_dir_id = cpu_to_le32 (par_dirid); + deh[1].deh_objectid = cpu_to_le32 (par_objid); + deh[1].deh_location = cpu_to_le16 (le16_to_cpu (deh[0].deh_location) - ROUND_UP (strlen (".."))); + deh[1].deh_state = 0; + mark_de_visible(&(deh[1])); + + /* copy ".." and "." */ + memcpy (body + deh[0].deh_location, ".", 1); + memcpy (body + deh[1].deh_location, "..", 2); +} + + +/* array of the entry headers */ + /* get item body */ +#define B_I_PITEM(bh,ih) ( (bh)->b_data + (ih)->ih_item_location ) +#define B_I_DEH(bh,ih) ((struct reiserfs_de_head *)(B_I_PITEM(bh,ih))) + +/* length of the directory entry in directory item. This define + calculates length of i-th directory entry using directory entry + locations from dir entry head. When it calculates length of 0-th + directory entry, it uses length of whole item in place of entry + location of the non-existent following entry in the calculation. + See picture above.*/ +/* +#define I_DEH_N_ENTRY_LENGTH(ih,deh,i) \ +((i) ? (((deh)-1)->deh_location - (deh)->deh_location) : ((ih)->ih_item_len) - (deh)->deh_location) +*/ +extern inline int entry_length (struct buffer_head * bh, struct item_head * ih, + int pos_in_item) +{ + struct reiserfs_de_head * deh; + + deh = B_I_DEH (bh, ih) + pos_in_item; + if (pos_in_item) + return (le16_to_cpu ((deh - 1)->deh_location) - le16_to_cpu (deh->deh_location)); + return (le16_to_cpu (ih->ih_item_len) - le16_to_cpu (deh->deh_location)); +} + + + +/* number of entries in the directory item, depends on ENTRY_COUNT being at the start of directory dynamic data. */ +#define I_ENTRY_COUNT(ih) ((ih)->u.ih_entry_count) + + +/* name by bh, ih and entry_num */ +#define B_I_E_NAME(bh,ih,entry_num) ((char *)(bh->b_data + ih->ih_item_location + (B_I_DEH(bh,ih)+(entry_num))->deh_location)) + +// two entries per block (at least) +//#define REISERFS_MAX_NAME_LEN(block_size) +//((block_size - BLKH_SIZE - IH_SIZE - DEH_SIZE * 2) / 2) + +// two entries per block (at least) +#define REISERFS_MAX_NAME_LEN(block_size) 255 + + + + +/* this structure is used for operations on directory entries. It is + not a disk structure. 
*/ +/* When reiserfs_find_entry or search_by_entry_key find directory + entry, they return filled reiserfs_dir_entry structure */ +struct reiserfs_dir_entry +{ + struct buffer_head * de_bh; + int de_item_num; + struct item_head * de_ih; + int de_entry_num; + struct reiserfs_de_head * de_deh; + int de_entrylen; + int de_namelen; + char * de_name; + char * de_gen_number_bit_string; + + __u32 de_dir_id; + __u32 de_objectid; + + struct cpu_key de_entry_key; +}; + +/* these defines are useful when a particular member of a reiserfs_dir_entry is needed */ + +/* pointer to file name, stored in entry */ +#define B_I_DEH_ENTRY_FILE_NAME(bh,ih,deh) (B_I_PITEM (bh, ih) + (deh)->deh_location) + +/* length of name */ +#define I_DEH_N_ENTRY_FILE_NAME_LENGTH(ih,deh,entry_num) \ +(I_DEH_N_ENTRY_LENGTH (ih, deh, entry_num) - (de_with_sd (deh) ? SD_SIZE : 0)) + + + +/* hash value occupies bits from 7 up to 30 */ +#define GET_HASH_VALUE(offset) ((offset) & 0x7fffff80LL) +/* generation number occupies 7 bits starting from 0 up to 6 */ +#define GET_GENERATION_NUMBER(offset) ((offset) & 0x7fLL) +#define MAX_GENERATION_NUMBER 127 + +#define SET_GENERATION_NUMBER(offset,gen_number) (GET_HASH_VALUE(offset)|(gen_number)) + + +/* + * Picture represents an internal node of the reiserfs tree + * ______________________________________________________ + * | | Array of | Array of | Free | + * |block | keys | pointers | space | + * | head | N | N+1 | | + * |______|_______________|___________________|___________| + */ + +/***************************************************************************/ +/* DISK CHILD */ +/***************************************************************************/ +/* Disk child pointer: The pointer from an internal node of the tree + to a node that is on disk. */ +struct disk_child { + __u32 dc_block_number; /* Disk child's block number. */ + __u16 dc_size; /* Disk child's used space. */ + __u16 dc_reserved; +}; + +#define DC_SIZE (sizeof(struct disk_child)) + +/* Get disk child by buffer header and position in the tree node. */ +#define B_N_CHILD(p_s_bh,n_pos) ((struct disk_child *)\ +((p_s_bh)->b_data+BLKH_SIZE+B_NR_ITEMS(p_s_bh)*KEY_SIZE+DC_SIZE*(n_pos))) + +/* Get disk child number by buffer header and position in the tree node. */ +#define B_N_CHILD_NUM(p_s_bh,n_pos) (le32_to_cpu (B_N_CHILD(p_s_bh,n_pos)->dc_block_number)) +#define PUT_B_N_CHILD_NUM(p_s_bh,n_pos, val) do { B_N_CHILD(p_s_bh,n_pos)->dc_block_number = cpu_to_le32(val); } while (0) + + /* maximal value of field child_size in structure disk_child */ + /* child size is the combined size of all items and their headers */ +#define MAX_CHILD_SIZE(bh) ((int)( (bh)->b_size - BLKH_SIZE )) + +/* amount of used space in buffer (not including block head) */ +#define B_CHILD_SIZE(cur) (MAX_CHILD_SIZE(cur)-(B_FREE_SPACE(cur))) + +/* max and min number of keys in internal node */ +#define MAX_NR_KEY(bh) ( (MAX_CHILD_SIZE(bh)-DC_SIZE)/(KEY_SIZE+DC_SIZE) ) +#define MIN_NR_KEY(bh) (MAX_NR_KEY(bh)/2) + +/***************************************************************************/ +/* PATH STRUCTURES AND DEFINES */ +/***************************************************************************/ + + +/* Search_by_key fills up the path from the root to the leaf as it descends the tree looking for the + key. It uses reiserfs_bread to try to find buffers in the cache given their block number. If it + does not find them in the cache it reads them from disk. 
For each node search_by_key finds using + reiserfs_bread it then uses bin_search to look through that node. bin_search will find the + position of the block_number of the next node if it is looking through an internal node. If it + is looking through a leaf node bin_search will find the position of the item which has key either + equal to given key, or which is the maximal key less than the given key. */ + +struct path_element { + struct buffer_head * pe_buffer; /* Pointer to the buffer at the path in the tree. */ + int pe_position; /* Position in the tree node which is placed in the */ + /* buffer above. */ +}; + +#define MAX_HEIGHT 5 /* maximal height of a tree. don't change this without changing JOURNAL_PER_BALANCE_CNT */ +#define EXTENDED_MAX_HEIGHT 7 /* Must be equals MAX_HEIGHT + FIRST_PATH_ELEMENT_OFFSET */ +#define FIRST_PATH_ELEMENT_OFFSET 2 /* Must be equal to at least 2. */ + +#define ILLEGAL_PATH_ELEMENT_OFFSET 1 /* Must be equal to FIRST_PATH_ELEMENT_OFFSET - 1 */ +#define MAX_FEB_SIZE 6 /* this MUST be MAX_HEIGHT + 1. See about FEB below */ + + + +/* We need to keep track of who the ancestors of nodes are. When we + perform a search we record which nodes were visited while + descending the tree looking for the node we searched for. This list + of nodes is called the path. This information is used while + performing balancing. Note that this path information may become + invalid, and this means we must check it when using it to see if it + is still valid. You'll need to read search_by_key and the comments + in it, especially about decrement_counters_in_path(), to understand + this structure. + +Paths make the code so much harder to work with and debug.... An +enormous number of bugs are due to them, and trying to write or modify +code that uses them just makes my head hurt. They are based on an +excessive effort to avoid disturbing the precious VFS code.:-( The +gods only know how we are going to SMP the code that uses them. +znodes are the way! */ + + +struct path { + int path_length; /* Length of the array above. */ + struct path_element path_elements[EXTENDED_MAX_HEIGHT]; /* Array of the path elements. */ + int pos_in_item; +}; + +#define pos_in_item(path) ((path)->pos_in_item) + +#define INITIALIZE_PATH(var) \ +struct path var = {ILLEGAL_PATH_ELEMENT_OFFSET, } + +/* Get path element by path and path position. */ +#define PATH_OFFSET_PELEMENT(p_s_path,n_offset) ((p_s_path)->path_elements +(n_offset)) + +/* Get buffer header at the path by path and path position. */ +#define PATH_OFFSET_PBUFFER(p_s_path,n_offset) (PATH_OFFSET_PELEMENT(p_s_path,n_offset)->pe_buffer) + +/* Get position in the element at the path by path and path position. */ +#define PATH_OFFSET_POSITION(p_s_path,n_offset) (PATH_OFFSET_PELEMENT(p_s_path,n_offset)->pe_position) + + +#define PATH_PLAST_BUFFER(p_s_path) (PATH_OFFSET_PBUFFER((p_s_path), (p_s_path)->path_length)) + /* you know, to the person who didn't + write this the macro name does not + at first suggest what it does. + Maybe POSITION_FROM_PATH_END? Or + maybe we should just focus on + dumping paths... -Hans */ +#define PATH_LAST_POSITION(p_s_path) (PATH_OFFSET_POSITION((p_s_path), (p_s_path)->path_length)) + + +#define PATH_PITEM_HEAD(p_s_path) B_N_PITEM_HEAD(PATH_PLAST_BUFFER(p_s_path),PATH_LAST_POSITION(p_s_path)) + +/* in do_balance leaf has h == 0 in contrast with path structure, + where root has level == 0. 
That is why we need these defines */ +#define PATH_H_PBUFFER(p_s_path, h) PATH_OFFSET_PBUFFER (p_s_path, p_s_path->path_length - (h)) /* tb->S[h] */ +#define PATH_H_PPARENT(path, h) PATH_H_PBUFFER (path, (h) + 1) /* tb->F[h] or tb->S[0]->b_parent */ +#define PATH_H_POSITION(path, h) PATH_OFFSET_POSITION (path, path->path_length - (h)) +#define PATH_H_B_ITEM_ORDER(path, h) PATH_H_POSITION(path, h + 1) /* tb->S[h]->b_item_order */ + +#define PATH_H_PATH_OFFSET(p_s_path, n_h) ((p_s_path)->path_length - (n_h)) + +#define get_bh(path) PATH_PLAST_BUFFER(path) +#define get_ih(path) PATH_PITEM_HEAD(path) +#define get_item_pos(path) PATH_LAST_POSITION(path) +#define get_item(path) ((void *)B_N_PITEM(PATH_PLAST_BUFFER(path), PATH_LAST_POSITION (path))) +#define item_moved(ih,path) comp_items(ih, path) +#define path_changed(ih,path) comp_items (ih, path) + + +/***************************************************************************/ +/* MISC */ +/***************************************************************************/ + +/* Size of pointer to the unformatted node. */ +#define UNFM_P_SIZE (sizeof(unp_t)) + +// in in-core inode key is stored on le form +#define INODE_PKEY(inode) ((struct key *)((inode)->u.reiserfs_i.i_key)) +//#define mark_tail_converted(inode) (atomic_set(&((inode)->u.reiserfs_i.i_converted),1)) +//#define unmark_tail_converted(inode) (atomic_set(&((inode)->u.reiserfs_i.i_converted), 0)) +//#define is_tail_converted(inode) (atomic_read(&((inode)->u.reiserfs_i.i_converted))) + + + +#define MAX_UL_INT 0xffffffff +#define MAX_INT 0x7ffffff +#define MAX_US_INT 0xffff + +///#define TOO_LONG_LENGTH (~0ULL) + +// reiserfs version 2 has max offset 60 bits. Version 1 - 32 bit offset +#define U32_MAX (~(__u32)0) +extern inline loff_t max_reiserfs_offset (struct inode * inode) +{ + if (inode_items_version (inode) == ITEM_VERSION_1) + return (loff_t)U32_MAX; + + return (loff_t)((~(__u64)0) >> 4); +} + + +/*#define MAX_KEY_UNIQUENESS MAX_UL_INT*/ +#define MAX_KEY_OBJECTID MAX_UL_INT + + +#define MAX_B_NUM MAX_UL_INT +#define MAX_FC_NUM MAX_US_INT + + +/* the purpose is to detect overflow of an unsigned short */ +#define REISERFS_LINK_MAX (MAX_US_INT - 1000) + + +/* The following defines are used in reiserfs_insert_item and reiserfs_append_item */ +#define REISERFS_KERNEL_MEM 0 /* reiserfs kernel memory mode */ +#define REISERFS_USER_MEM 1 /* reiserfs user memory mode */ + +#define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter) +#define get_generation(s) atomic_read (&fs_generation(s)) +#define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) +#define fs_changed(gen,s) (gen != get_generation (s)) + + +/***************************************************************************/ +/* FIXATE NODES */ +/***************************************************************************/ + +//#define VI_TYPE_STAT_DATA 1 +//#define VI_TYPE_DIRECT 2 +//#define VI_TYPE_INDIRECT 4 +//#define VI_TYPE_DIRECTORY 8 +//#define VI_TYPE_FIRST_DIRECTORY_ITEM 16 +//#define VI_TYPE_INSERTED_DIRECTORY_ITEM 32 + +#define VI_TYPE_LEFT_MERGEABLE 1 +#define VI_TYPE_RIGHT_MERGEABLE 2 + +/* To make any changes in the tree we always first find node, that + contains item to be changed/deleted or place to insert a new + item. We call this node S. To do balancing we need to decide what + we will shift to left/right neighbor, or to a new node, where new + item will be etc. To make this analysis simpler we build virtual + node. 
Virtual node is an array of items, that will replace items of + node S. (For instance if we are going to delete an item, virtual + node does not contain it). Virtual node keeps information about + item sizes and types, mergeability of first and last items, sizes + of all entries in directory item. We use this array of items when + calculating what we can shift to neighbors and how many nodes we + have to have if we do not any shiftings, if we shift to left/right + neighbor or to both. */ +struct virtual_item +{ + int vi_index; // index in the array of item operations + unsigned short vi_type; // left/right mergeability + unsigned short vi_item_len; /* length of item that it will have after balancing */ + struct item_head * vi_ih; + const char * vi_item; // body of item (old or new) + const void * vi_new_data; // 0 always but paste mode + void * vi_uarea; // item specific area +}; + + +struct virtual_node +{ + char * vn_free_ptr; /* this is a pointer to the free space in the buffer */ + unsigned short vn_nr_item; /* number of items in virtual node */ + short vn_size; /* size of node , that node would have if it has unlimited size and no balancing is performed */ + short vn_mode; /* mode of balancing (paste, insert, delete, cut) */ + short vn_affected_item_num; + short vn_pos_in_item; + struct item_head * vn_ins_ih; /* item header of inserted item, 0 for other modes */ + const void * vn_data; + struct virtual_item * vn_vi; /* array of items (including a new one, excluding item to be deleted) */ +}; + + +/***************************************************************************/ +/* TREE BALANCE */ +/***************************************************************************/ + +/* This temporary structure is used in tree balance algorithms, and + constructed as we go to the extent that its various parts are + needed. It contains arrays of nodes that can potentially be + involved in the balancing of node S, and parameters that define how + each of the nodes must be balanced. Note that in these algorithms + for balancing the worst case is to need to balance the current node + S and the left and right neighbors and all of their parents plus + create a new node. We implement S1 balancing for the leaf nodes + and S0 balancing for the internal nodes (S1 and S0 are defined in + our papers.)*/ + +#define MAX_FREE_BLOCK 7 /* size of the array of buffers to free at end of do_balance */ + +/* maximum number of FEB blocknrs on a single level */ +#define MAX_AMOUNT_NEEDED 2 + +/* someday somebody will prefix every field in this struct with tb_ */ +struct tree_balance +{ + int tb_mode; + int need_balance_dirty; + struct super_block * tb_sb; + struct reiserfs_transaction_handle *transaction_handle ; + struct path * tb_path; + struct buffer_head * L[MAX_HEIGHT]; /* array of left neighbors of nodes in the path */ + struct buffer_head * R[MAX_HEIGHT]; /* array of right neighbors of nodes in the path*/ + struct buffer_head * FL[MAX_HEIGHT]; /* array of fathers of the left neighbors */ + struct buffer_head * FR[MAX_HEIGHT]; /* array of fathers of the right neighbors */ + struct buffer_head * CFL[MAX_HEIGHT]; /* array of common parents of center node and its left neighbor */ + struct buffer_head * CFR[MAX_HEIGHT]; /* array of common parents of center node and its right neighbor */ + + struct buffer_head * FEB[MAX_FEB_SIZE]; /* array of empty buffers. Number of buffers in array equals + cur_blknum. 
*/ + struct buffer_head * used[MAX_FEB_SIZE]; + struct buffer_head * thrown[MAX_FEB_SIZE]; + int lnum[MAX_HEIGHT]; /* array of number of items which must be + shifted to the left in order to balance the + current node; for leaves includes item that + will be partially shifted; for internal + nodes, it is the number of child pointers + rather than items. It includes the new item + being created. The code sometimes subtracts + one to get the number of wholly shifted + items for other purposes. */ + int rnum[MAX_HEIGHT]; /* substitute right for left in comment above */ + int lkey[MAX_HEIGHT]; /* array indexed by height h mapping the key delimiting L[h] and + S[h] to its item number within the node CFL[h] */ + int rkey[MAX_HEIGHT]; /* substitute r for l in comment above */ + int insert_size[MAX_HEIGHT]; /* the number of bytes by we are trying to add or remove from + S[h]. A negative value means removing. */ + int blknum[MAX_HEIGHT]; /* number of nodes that will replace node S[h] after + balancing on the level h of the tree. If 0 then S is + being deleted, if 1 then S is remaining and no new nodes + are being created, if 2 or 3 then 1 or 2 new nodes is + being created */ + + /* fields that are used only for balancing leaves of the tree */ + int cur_blknum; /* number of empty blocks having been already allocated */ + int s0num; /* number of items that fall into left most node when S[0] splits */ + int s1num; /* number of items that fall into first new node when S[0] splits */ + int s2num; /* number of items that fall into second new node when S[0] splits */ + int lbytes; /* number of bytes which can flow to the left neighbor from the left */ + /* most liquid item that cannot be shifted from S[0] entirely */ + /* if -1 then nothing will be partially shifted */ + int rbytes; /* number of bytes which will flow to the right neighbor from the right */ + /* most liquid item that cannot be shifted from S[0] entirely */ + /* if -1 then nothing will be partially shifted */ + int s1bytes; /* number of bytes which flow to the first new node when S[0] splits */ + /* note: if S[0] splits into 3 nodes, then items do not need to be cut */ + int s2bytes; + struct buffer_head * buf_to_free[MAX_FREE_BLOCK]; /* buffers which are to be freed after do_balance finishes by unfix_nodes */ + char * vn_buf; /* kmalloced memory. Used to create + virtual node and keep map of + dirtied bitmap blocks */ + int vn_buf_size; /* size of the vn_buf */ + struct virtual_node * tb_vn; /* VN starts after bitmap of bitmap blocks */ + + int fs_gen; /* saved value of `reiserfs_generation' counter + see FILESYSTEM_CHANGED() macro in reiserfs_fs.h */ +} ; + + +#if 0 + /* when balancing we potentially affect a 3 node wide column of nodes + in the tree (the top of the column may be tapered). C is the nodes + at the center of this column, and L and R are the nodes to the + left and right. 
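
/* A userspace sketch of the generation-counter pattern behind fs_generation(),
   get_generation(), fs_changed() and the fs_gen field saved in tree_balance
   above: remember the counter before a step that may block, bump it whenever
   the tree is rebalanced, and repeat the search if the two values no longer
   match.  Plain ints stand in for the kernel's atomic_t. */
#include <stdio.h>

static int s_generation_counter;        /* bumped by every rebalance        */

static void rebalance_tree(void)        /* stand-in for do_balance()        */
{
    s_generation_counter++;
}

int main(void)
{
    int fs_gen = s_generation_counter;  /* saved, as tree_balance.fs_gen is */

    rebalance_tree();                   /* someone else changed the tree    */

    if (fs_gen != s_generation_counter) /* fs_changed(fs_gen, s)            */
        printf("tree changed: repeat search_by_key() before balancing\n");
    else
        printf("cached path is still valid\n");
    return 0;
}
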
*/ + struct seal * L_path_seals[MAX_HEIGHT]; + struct seal * C_path_seals[MAX_HEIGHT]; + struct seal * R_path_seals[MAX_HEIGHT]; + char L_path_lock_types[MAX_HEIGHT]; /* 'r', 'w', or 'n' for read, write, or none */ + char C_path_lock_types[MAX_HEIGHT]; + char R_path_lock_types[MAX_HEIGHT]; + + + struct seal_list_elem * C_seal[MAX_HEIGHT]; /* array of seals on nodes in the path */ + struct seal_list_elem * L_seal[MAX_HEIGHT]; /* array of seals on left neighbors of nodes in the path */ + struct seal_list_elem * R_seal[MAX_HEIGHT]; /* array of seals on right neighbors of nodes in the path*/ + struct seal_list_elem * FL_seal[MAX_HEIGHT]; /* array of seals on fathers of the left neighbors */ + struct seal_list_elem * FR_seal[MAX_HEIGHT]; /* array of seals on fathers of the right neighbors */ + struct seal_list_elem * CFL_seal[MAX_HEIGHT]; /* array of seals on common parents of center node and its left neighbor */ + struct seal_list_elem * CFR_seal[MAX_HEIGHT]; /* array of seals on common parents of center node and its right neighbor */ + + struct char C_desired_lock_type[MAX_HEIGHT]; /* 'r', 'w', or 'n' for read, write, or none */ + struct char L_desired_lock_type[MAX_HEIGHT]; + struct char R_desired_lock_type[MAX_HEIGHT]; + struct char FL_desired_lock_type[MAX_HEIGHT]; + struct char FR_desired_lock_type[MAX_HEIGHT]; + struct char CFL_desired_lock_type[MAX_HEIGHT]; + struct char CFR_desired_lock_type[MAX_HEIGHT]; +#endif + + + + + +/* These are modes of balancing */ + +/* When inserting an item. */ +#define M_INSERT 'i' +/* When inserting into (directories only) or appending onto an already + existant item. */ +#define M_PASTE 'p' +/* When deleting an item. */ +#define M_DELETE 'd' +/* When truncating an item or removing an entry from a (directory) item. */ +#define M_CUT 'c' + +/* used when balancing on leaf level skipped (in reiserfsck) */ +#define M_INTERNAL 'n' + +/* When further balancing is not needed, then do_balance does not need + to be called. */ +#define M_SKIP_BALANCING 's' +#define M_CONVERT 'v' + +/* modes of leaf_move_items */ +#define LEAF_FROM_S_TO_L 0 +#define LEAF_FROM_S_TO_R 1 +#define LEAF_FROM_R_TO_L 2 +#define LEAF_FROM_L_TO_R 3 +#define LEAF_FROM_S_TO_SNEW 4 + +#define FIRST_TO_LAST 0 +#define LAST_TO_FIRST 1 + +/* used in do_balance for passing parent of node information that has + been gotten from tb struct */ +struct buffer_info { + struct tree_balance * tb; + struct buffer_head * bi_bh; + struct buffer_head * bi_parent; + int bi_position; +}; + + +/* there are 4 types of items: stat data, directory item, indirect, direct. ++-------------------+------------+--------------+------------+ +| | k_offset | k_uniqueness | mergeable? 
| ++-------------------+------------+--------------+------------+ +| stat data | 0 | 0 | no | ++-------------------+------------+--------------+------------+ +| 1st directory item| DOT_OFFSET |DIRENTRY_UNIQUENESS| no | +| non 1st directory | hash value | | yes | +| item | | | | ++-------------------+------------+--------------+------------+ +| indirect item | offset + 1 |TYPE_INDIRECT | if this is not the first indirect item of the object ++-------------------+------------+--------------+------------+ +| direct item | offset + 1 |TYPE_DIRECT | if not this is not the first direct item of the object ++-------------------+------------+--------------+------------+ +*/ + +struct item_operations { + int (*bytes_number) (struct item_head * ih, int block_size); + void (*decrement_key) (struct cpu_key *); + int (*is_left_mergeable) (struct key * ih, unsigned long bsize); + void (*print_item) (struct item_head *, char * item); + void (*check_item) (struct item_head *, char * item); + + int (*create_vi) (struct virtual_node * vn, struct virtual_item * vi, + int is_affected, int insert_size); + int (*check_left) (struct virtual_item * vi, int free, + int start_skip, int end_skip); + int (*check_right) (struct virtual_item * vi, int free); + int (*part_size) (struct virtual_item * vi, int from, int to); + int (*unit_num) (struct virtual_item * vi); + void (*print_vi) (struct virtual_item * vi); +}; + + +extern struct item_operations stat_data_ops, indirect_ops, direct_ops, + direntry_ops; +extern struct item_operations * item_ops [4]; + +#define op_bytes_number(ih,bsize) item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize) +#define op_is_left_mergeable(key,bsize) item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize) +#define op_print_item(ih,item) item_ops[le_ih_k_type (ih)]->print_item (ih, item) +#define op_check_item(ih,item) item_ops[le_ih_k_type (ih)]->check_item (ih, item) +#define op_create_vi(vn,vi,is_affected,insert_size) item_ops[le_ih_k_type ((vi)->vi_ih)]->create_vi (vn,vi,is_affected,insert_size) +#define op_check_left(vi,free,start_skip,end_skip) item_ops[(vi)->vi_index]->check_left (vi, free, start_skip, end_skip) +#define op_check_right(vi,free) item_ops[(vi)->vi_index]->check_right (vi, free) +#define op_part_size(vi,from,to) item_ops[(vi)->vi_index]->part_size (vi, from, to) +#define op_unit_num(vi) item_ops[(vi)->vi_index]->unit_num (vi) +#define op_print_vi(vi) item_ops[(vi)->vi_index]->print_vi (vi) + + + + + +#define COMP_KEYS comp_keys +#define COMP_SHORT_KEYS comp_short_keys +#define keys_of_same_object comp_short_keys + +/*#define COMP_KEYS(p_s_key1, p_s_key2) comp_keys((unsigned long *)(p_s_key1), (unsigned long *)(p_s_key2)) +#define COMP_SHORT_KEYS(p_s_key1, p_s_key2) comp_short_keys((unsigned long *)(p_s_key1), (unsigned long *)(p_s_key2))*/ + + +/* number of blocks pointed to by the indirect item */ +#define I_UNFM_NUM(p_s_ih) ( (p_s_ih)->ih_item_len / UNFM_P_SIZE ) + +/* the used space within the unformatted node corresponding to pos within the item pointed to by ih */ +#define I_POS_UNFM_SIZE(ih,pos,size) (((pos) == I_UNFM_NUM(ih) - 1 ) ? 
(size) - (ih)->u.ih_free_space : (size)) + +/* number of bytes contained by the direct item or the unformatted nodes the indirect item points to */ + + +/* get the item header */ +#define B_N_PITEM_HEAD(bh,item_num) ( (struct item_head * )((bh)->b_data + BLKH_SIZE) + (item_num) ) + +/* get key */ +#define B_N_PDELIM_KEY(bh,item_num) ( (struct key * )((bh)->b_data + BLKH_SIZE) + (item_num) ) + +/* get the key */ +#define B_N_PKEY(bh,item_num) ( &(B_N_PITEM_HEAD(bh,item_num)->ih_key) ) + +/* get item body */ +#define B_N_PITEM(bh,item_num) ( (bh)->b_data + B_N_PITEM_HEAD((bh),(item_num))->ih_item_location) + +/* get the stat data by the buffer header and the item order */ +#define B_N_STAT_DATA(bh,nr) \ +( (struct stat_data *)((bh)->b_data+B_N_PITEM_HEAD((bh),(nr))->ih_item_location ) ) + + /* following defines use reiserfs buffer header and item header */ + +/* get stat-data */ +#define B_I_STAT_DATA(bh, ih) ( (struct stat_data * )((bh)->b_data + (ih)->ih_item_location) ) + +// this is 3976 for size==4096 +#define MAX_DIRECT_ITEM_LEN(size) ((size) - BLKH_SIZE - 2*IH_SIZE - SD_SIZE - UNFM_P_SIZE) + +/* indirect items consist of entries which contain blocknrs, pos + indicates which entry, and B_I_POS_UNFM_POINTER resolves to the + blocknr contained by the entry pos points to */ +#define B_I_POS_UNFM_POINTER(bh,ih,pos) (*(((unp_t *)B_I_PITEM(bh,ih)) + (pos))) +#define PUT_B_I_POS_UNFM_POINTER(bh,ih,pos, val) do {*(((unp_t *)B_I_PITEM(bh,ih)) + (pos)) = cpu_to_le32(val); } while (0) + +/* Reiserfs buffer cache statistics. */ +#ifdef REISERFS_CACHE_STAT + struct reiserfs_cache_stat + { + int nr_reiserfs_ll_r_block; /* Number of block reads. */ + int nr_reiserfs_ll_w_block; /* Number of block writes. */ + int nr_reiserfs_schedule; /* Number of locked buffers waits. */ + unsigned long nr_reiserfs_bread; /* Number of calls to reiserfs_bread function */ + unsigned long nr_returns; /* Number of breads of buffers that were hoped to contain a key but did not after bread completed + (usually due to object shifting while bread was executing.) + In the code this manifests as the number + of times that the repeat variable is nonzero in search_by_key.*/ + unsigned long nr_fixed; /* number of calls of fix_nodes function */ + unsigned long nr_failed; /* number of calls of fix_nodes in which schedule occurred while the function worked */ + unsigned long nr_find1; /* How many times we access a child buffer using its direct pointer from an internal node.*/ + unsigned long nr_find2; /* Number of times there is neither a direct pointer to + nor any entry in the child list pointing to the buffer. */ + unsigned long nr_find3; /* When parent is locked (meaning that there are no direct pointers) + or parent is leaf and buffer to be found is an unformatted node. */ + } cache_stat; +#endif + +struct reiserfs_iget4_args { + __u32 objectid ; +} ; + +/***************************************************************************/ +/* FUNCTION DECLARATIONS */ +/***************************************************************************/ + +/*#ifdef __KERNEL__*/ + +/* journal.c see journal.c for all the comments here */ + +#define JOURNAL_TRANS_HALF 1018 /* must be correct to keep the desc and commit structs at 4k */ + + +/* first block written in a commit. */ +struct reiserfs_journal_desc { + __u32 j_trans_id ; /* id of commit */ + __u32 j_len ; /* length of commit. 
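
/* A self-contained sketch of the leaf layout the B_N_PITEM_HEAD/B_N_PITEM
   macros above walk: a block head at the front of the buffer, an array of
   item heads right after it, and each item body found through its
   ih_item_location offset.  The trimmed structure and the 24-byte BLKH_SIZE
   are placeholders, not the on-disk definitions. */
#include <stdio.h>
#include <string.h>

#define BLK_SIZE  4096
#define BLKH_SIZE 24                        /* assumed block head size */

struct demo_item_head {                     /* trimmed stand-in        */
    unsigned short ih_item_len;
    unsigned short ih_item_location;        /* offset of body in block */
};

static struct demo_item_head *item_head(char *block, int n)
{
    /* mirrors B_N_PITEM_HEAD(bh, item_num) */
    return (struct demo_item_head *)(block + BLKH_SIZE) + n;
}

static char *item_body(char *block, int n)
{
    /* mirrors B_N_PITEM(bh, item_num) */
    return block + item_head(block, n)->ih_item_location;
}

int main(void)
{
    static char block[BLK_SIZE];
    struct demo_item_head *ih = item_head(block, 0);

    ih->ih_item_len = 5;
    ih->ih_item_location = BLK_SIZE - 8;    /* body stored near the end */
    memcpy(block + ih->ih_item_location, "body", 5);

    printf("item 0: len=%u body=\"%s\"\n",
           (unsigned)ih->ih_item_len, item_body(block, 0));
    return 0;
}
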
len +1 is the commit block */ + __u32 j_mount_id ; /* mount id of this trans*/ + __u32 j_realblock[JOURNAL_TRANS_HALF] ; /* real locations for each block */ + char j_magic[12] ; +} ; + +/* last block written in a commit */ +struct reiserfs_journal_commit { + __u32 j_trans_id ; /* must match j_trans_id from the desc block */ + __u32 j_len ; /* ditto */ + __u32 j_realblock[JOURNAL_TRANS_HALF] ; /* real locations for each block */ + char j_digest[16] ; /* md5 sum of all the blocks involved, including desc and commit. not used, kill it */ +} ; + +/* this header block gets written whenever a transaction is considered fully flushed, and is more recent than the +** last fully flushed transaction. fully flushed means all the log blocks and all the real blocks are on disk, +** and this transaction does not need to be replayed. +*/ +struct reiserfs_journal_header { + __u32 j_last_flush_trans_id ; /* id of last fully flushed transaction */ + __u32 j_first_unflushed_offset ; /* offset in the log of where to start replay after a crash */ + __u32 j_mount_id ; +} ; + +/* these are used to keep flush pages that contain converted direct items. +** if the page is not flushed before the transaction that converted it +** is committed, we risk losing data +** +** note, while a page is in this list, its counter is incremented. +*/ +struct reiserfs_page_list { + struct reiserfs_page_list *next ; + struct reiserfs_page_list *prev ; + struct page *page ; + unsigned long blocknr ; /* block number holding converted data */ + + /* if a transaction writer has the page locked the flush_page_list + ** function doesn't need to (and can't) get the lock while flushing + ** the page. do_not_lock needs to be set by anyone who calls journal_end + ** with a page lock held. They have to look in the inode and see + ** if the inode has the page they have locked in the flush list. + ** + ** this sucks. + */ + int do_not_lock ; +} ; + +extern task_queue reiserfs_commit_thread_tq ; +extern wait_queue_head_t reiserfs_commit_thread_wait ; + +/* biggest tunable defines are right here */ +#define JOURNAL_BLOCK_COUNT 8192 /* number of blocks in the journal */ +#define JOURNAL_MAX_BATCH 900 /* max blocks to batch into one transaction, don't make this any bigger than 900 */ +#define JOURNAL_MAX_COMMIT_AGE 30 +#define JOURNAL_MAX_TRANS_AGE 30 +#define JOURNAL_PER_BALANCE_CNT 12 /* must be >= (5 + 2 * (MAX_HEIGHT-2) + 1) */ + +/* both of these can be as low as 1, or as high as you want. The min is the +** number of 4k bitmap nodes preallocated on mount. New nodes are allocated +** as needed, and released when transactions are committed. On release, if +** the current number of nodes is > max, the node is freed, otherwise, +** it is put on a free list for faster use later. +*/ +#define REISERFS_MIN_BITMAP_NODES 10 +#define REISERFS_MAX_BITMAP_NODES 100 + +#define JBH_HASH_SHIFT 13 /* these are based on journal hash size of 8192 */ +#define JBH_HASH_MASK 8191 + +/* After several hours of tedious analysis, the following hash + * function won. Do not mess with it... -DaveM + */ +#define _jhashfn(dev,block) \ + ((((dev)<<(JBH_HASH_SHIFT - 6)) ^ ((dev)<<(JBH_HASH_SHIFT - 9))) ^ \ + (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12)))) +#define journal_hash(t,dev,block) ((t)[_jhashfn((dev),(block)) & JBH_HASH_MASK]) + +/* finds n'th buffer with 0 being the start of this commit. Needs to go away, j_ap_blocks has changed +** since I created this. 
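
/* The journal hash can be exercised outside the kernel; the macro below is
   copied from the defines above, with plain unsigned ints standing in for
   kdev_t.  With JBH_HASH_SHIFT 13 it mixes shifted copies of the device and
   block numbers, and masking with JBH_HASH_MASK folds the result into one of
   the 8192 hash chains. */
#include <stdio.h>

#define JBH_HASH_SHIFT 13
#define JBH_HASH_MASK  8191

#define _jhashfn(dev,block) \
  ((((dev)<<(JBH_HASH_SHIFT - 6)) ^ ((dev)<<(JBH_HASH_SHIFT - 9))) ^ \
   (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))

int main(void)
{
    unsigned int dev = 0x0801;              /* example device number */
    unsigned int block;

    for (block = 100; block < 104; block++)
        printf("dev 0x%04x block %u -> chain %u\n",
               dev, block, _jhashfn(dev, block) & JBH_HASH_MASK);
    return 0;
}
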
One chunk of code in journal.c needs changing before deleting it +*/ +#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT]) + +void reiserfs_wait_on_write_block(struct super_block *s) ; +void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ; +void reiserfs_allow_writes(struct super_block *s) ; +void reiserfs_check_lock_depth(char *caller) ; +void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ; +void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ; +int journal_init(struct super_block *) ; +int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ; +int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ; +int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ; +int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ; +int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ; +int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, unsigned long blocknr) ; +int push_journal_writer(char *w) ; +int pop_journal_writer(int windex) ; +int journal_lock_dobalance(struct super_block *p_s_sb) ; +int journal_unlock_dobalance(struct super_block *p_s_sb) ; +int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ; +int reiserfs_in_journal(struct super_block *p_s_sb, kdev_t dev, unsigned long bl, int size, int searchall, unsigned long *next) ; +int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ; +int journal_join(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ; +struct super_block *reiserfs_get_super(kdev_t dev) ; +void flush_async_commits(struct super_block *p_s_sb) ; + +int remove_from_transaction(struct super_block *p_s_sb, unsigned long blocknr, int already_cleaned) ; +int remove_from_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, struct buffer_head *bh, int remove_freed) ; + +int buffer_journaled(struct buffer_head *bh) ; +int mark_buffer_journal_new(struct buffer_head *bh) ; +int reiserfs_sync_all_buffers(kdev_t dev, int wait) ; +int reiserfs_sync_buffers(kdev_t dev, int wait) ; +int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *, + struct inode *, struct buffer_head *) ; +int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *, + struct inode *) ; + +int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ; + + /* why is this kerplunked right here? 
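
/* A quick check of the constraint quoted next to JOURNAL_PER_BALANCE_CNT
   above.  MAX_HEIGHT is defined earlier in reiserfs_fs.h, outside this hunk;
   the conventional value 5 is assumed here, and with it the lower bound
   5 + 2*(MAX_HEIGHT-2) + 1 evaluates to 12, exactly the value the define
   uses. */
#include <stdio.h>

#define MAX_HEIGHT               5      /* assumption, not from this hunk */
#define JOURNAL_PER_BALANCE_CNT 12

int main(void)
{
    int lower_bound = 5 + 2 * (MAX_HEIGHT - 2) + 1;

    printf("lower bound = %d, define = %d -> %s\n",
           lower_bound, JOURNAL_PER_BALANCE_CNT,
           JOURNAL_PER_BALANCE_CNT >= lower_bound ? "ok" : "too small");
    return 0;
}
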
*/ +static inline int reiserfs_buffer_prepared(struct buffer_head *bh) { + if (bh && test_bit(BH_JPrepared, &bh->b_state)) + return 1 ; + else + return 0 ; +} + +/* buffer was journaled, waiting to get to disk */ +static inline int buffer_journal_dirty(struct buffer_head *bh) { + if (bh) + return test_bit(BH_JDirty_wait, &bh->b_state) ; + else + return 0 ; +} +static inline int mark_buffer_notjournal_dirty(struct buffer_head *bh) { + if (bh) + clear_bit(BH_JDirty_wait, &bh->b_state) ; + return 0 ; +} +static inline int mark_buffer_notjournal_new(struct buffer_head *bh) { + if (bh) { + clear_bit(BH_JNew, &bh->b_state) ; + } + return 0 ; +} + +/* objectid.c */ +__u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th); +void reiserfs_release_objectid (struct reiserfs_transaction_handle *th, __u32 objectid_to_release); +int reiserfs_convert_objectid_map_v1(struct super_block *) ; + +/* stree.c */ +int B_IS_IN_TREE(struct buffer_head *); +extern inline void copy_key (void * to, void * from); +extern inline void copy_short_key (void * to, void * from); +extern inline void copy_item_head(void * p_v_to, void * p_v_from); + +// first key is in cpu form, second - le +extern inline int comp_keys (struct key * le_key, struct cpu_key * cpu_key); +extern inline int comp_short_keys (struct key * le_key, struct cpu_key * cpu_key); +extern inline void le_key2cpu_key (struct cpu_key * to, struct key * from); + +// both are cpu keys +extern inline int comp_cpu_keys (struct cpu_key *, struct cpu_key *); +extern inline int comp_short_cpu_keys (struct cpu_key *, struct cpu_key *); +extern inline void cpu_key2cpu_key (struct cpu_key *, struct cpu_key *); + +// both are in le form +extern inline int comp_le_keys (struct key *, struct key *); +extern inline int comp_short_le_keys (struct key *, struct key *); + +// +// get key version from on disk key - kludge +// +extern inline int le_key_version (struct key * key) +{ + int type; + + type = le16_to_cpu (key->u.k_offset_v2.k_type); + if (type != TYPE_DIRECT && type != TYPE_INDIRECT && type != TYPE_DIRENTRY) + return ITEM_VERSION_1; + + return ITEM_VERSION_2; + +} + + +extern inline void copy_key (void * to, void * from) +{ + memcpy (to, from, KEY_SIZE); +} + + +int comp_items (struct item_head * p_s_ih, struct path * p_s_path); +struct key * get_rkey (struct path * p_s_chk_path, struct super_block * p_s_sb); +inline int bin_search (void * p_v_key, void * p_v_base, int p_n_num, int p_n_width, int * p_n_pos); +int search_by_key (struct super_block *, struct cpu_key *, struct path *, int); +#define search_item(s,key,path) search_by_key (s, key, path, DISK_LEAF_NODE_LEVEL) +int search_for_position_by_key (struct super_block * p_s_sb, struct cpu_key * p_s_cpu_key, struct path * p_s_search_path); +extern inline void decrement_bcount (struct buffer_head * p_s_bh); +void decrement_counters_in_path (struct path * p_s_search_path); +void pathrelse (struct path * p_s_search_path); +int reiserfs_check_path(struct path *p) ; +void pathrelse_and_restore (struct super_block *s, struct path * p_s_search_path); + +int reiserfs_insert_item (struct reiserfs_transaction_handle *th, + struct path * path, + struct cpu_key * key, + struct item_head * ih, const char * body); + +int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, + struct path * path, + struct cpu_key * key, + const char * body, int paste_size); + +int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th, + struct path * path, + struct cpu_key * key, + struct inode * inode, + 
struct page *page, + loff_t new_file_size); + +int reiserfs_delete_item (struct reiserfs_transaction_handle *th, + struct path * path, + struct cpu_key * key, + struct inode * inode, + struct buffer_head * p_s_un_bh); + + +void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode); +void reiserfs_do_truncate (struct reiserfs_transaction_handle *th, + struct inode * p_s_inode, struct page *, + int update_timestamps); +// +//void lock_inode_to_convert (struct inode * p_s_inode); +//void unlock_inode_after_convert (struct inode * p_s_inode); +//void increment_i_read_sync_counter (struct inode * p_s_inode); +//void decrement_i_read_sync_counter (struct inode * p_s_inode); + + +#define block_size(inode) ((inode)->i_sb->s_blocksize) +#define file_size(inode) ((inode)->i_size) +#define tail_size(inode) (file_size (inode) & (block_size (inode) - 1)) + +#define tail_has_to_be_packed(inode) (!dont_have_tails ((inode)->i_sb) &&\ +!STORE_TAIL_IN_UNFM(file_size (inode), tail_size(inode), block_size (inode))) + +/* +int get_buffer_by_range (struct super_block * p_s_sb, struct key * p_s_range_begin, struct key * p_s_range_end, + struct buffer_head ** pp_s_buf, unsigned long * p_n_objectid); +int get_buffers_from_range (struct super_block * p_s_sb, struct key * p_s_range_start, struct key * p_s_range_end, + struct buffer_head ** p_s_range_buffers, + int n_max_nr_buffers_to_return); +*/ + +#ifndef REISERFS_FSCK + +//inline int is_left_mergeable (struct item_head * ih, unsigned long bsize); + +#else + +int is_left_mergeable (struct super_block * s, struct path * path); +int is_right_mergeable (struct super_block * s, struct path * path); +int are_items_mergeable (struct item_head * left, struct item_head * right, int bsize); + +#endif +void padd_item (char * item, int total_length, int length); + + +/* inode.c */ + +int reiserfs_prepare_write(struct file *, struct page *, unsigned, unsigned) ; +void reiserfs_truncate_file(struct inode *, int update_timestamps) ; +void make_cpu_key (struct cpu_key * cpu_key, const struct inode * inode, loff_t offset, + int type, int key_length); +void make_le_item_head (struct item_head * ih, struct cpu_key * key, int version, + loff_t offset, int type, int length, int entry_count); +/*void store_key (struct key * key); +void forget_key (struct key * key);*/ +int reiserfs_get_block (struct inode * inode, long block, + struct buffer_head * bh_result, int create); +struct inode * reiserfs_iget (struct super_block * s, struct cpu_key * key); +void reiserfs_read_inode (struct inode * inode) ; +void reiserfs_read_inode2(struct inode * inode, void *p) ; +void reiserfs_delete_inode (struct inode * inode); +extern int reiserfs_notify_change(struct dentry * dentry, struct iattr * attr); +void reiserfs_write_inode (struct inode * inode, int) ; + +/* we don't mark inodes dirty, we just log them */ +void reiserfs_dirty_inode (struct inode * inode) ; + +struct inode * reiserfs_new_inode (struct reiserfs_transaction_handle *th, const struct inode * dir, int mode, + const char * symname, int item_len, + struct dentry *dentry, struct inode *inode, int * err); +int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode); +void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode); +int reiserfs_inode_setattr(struct dentry *, struct iattr * attr); + +/* namei.c */ +inline void set_de_name_and_namelen (struct reiserfs_dir_entry * de); +int search_by_entry_key (struct super_block * sb, struct cpu_key * 
key, struct path * path, + struct reiserfs_dir_entry * de); +struct dentry * reiserfs_lookup (struct inode * dir, struct dentry *dentry); +int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode); +int reiserfs_mknod (struct inode * dir_inode, struct dentry *dentry, int mode, int rdev); +int reiserfs_mkdir (struct inode * dir, struct dentry *dentry, int mode); +int reiserfs_rmdir (struct inode * dir, struct dentry *dentry); +int reiserfs_unlink (struct inode * dir, struct dentry *dentry); +int reiserfs_symlink (struct inode * dir, struct dentry *dentry, const char * symname); +int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry); +int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir, struct dentry *new_dentry); + +/* super.c */ +inline void reiserfs_mark_buffer_dirty (struct buffer_head * bh, int flag); +inline void reiserfs_mark_buffer_clean (struct buffer_head * bh); +void reiserfs_panic (struct super_block * s, const char * fmt, ...); +void reiserfs_write_super (struct super_block * s); +void reiserfs_put_super (struct super_block * s); +int reiserfs_remount (struct super_block * s, int * flags, char * data); +/*int read_super_block (struct super_block * s, int size); +int read_bitmaps (struct super_block * s); +int read_old_bitmaps (struct super_block * s); +int read_old_super_block (struct super_block * s, int size);*/ +struct super_block * reiserfs_read_super (struct super_block * s, void * data, int silent); +int reiserfs_statfs (struct super_block * s, struct statfs * buf); + +/* dir.c */ +extern struct inode_operations reiserfs_dir_inode_operations; +extern struct file_operations reiserfs_dir_operations; + +/* tail_conversion.c */ +int direct2indirect (struct reiserfs_transaction_handle *, struct inode *, struct path *, struct buffer_head *, loff_t); +int indirect2direct (struct reiserfs_transaction_handle *, struct inode *, struct page *, struct path *, struct cpu_key *, loff_t, char *); +void reiserfs_unmap_buffer(struct buffer_head *) ; + + +/* file.c */ +extern struct inode_operations reiserfs_file_inode_operations; +extern struct file_operations reiserfs_file_operations; +extern struct address_space_operations reiserfs_address_space_operations ; +int get_new_buffer (struct reiserfs_transaction_handle *th, struct buffer_head *, + struct buffer_head **, struct path *); + + +/* buffer2.c */ +struct buffer_head * reiserfs_getblk (kdev_t n_dev, int n_block, int n_size); +void wait_buffer_until_released (struct buffer_head * bh); +struct buffer_head * reiserfs_bread (kdev_t n_dev, int n_block, int n_size); + + +/* fix_nodes.c */ +void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s); +void reiserfs_kfree (const void * vp, size_t size, struct super_block * s); +int fix_nodes (int n_op_mode, struct tree_balance * p_s_tb, struct item_head * p_s_ins_ih, const void *); +void unfix_nodes (struct tree_balance *); +void free_buffers_in_tb (struct tree_balance * p_s_tb); + + +/* prints.c */ +void reiserfs_panic (struct super_block * s, const char * fmt, ...); +void reiserfs_warning (const char * fmt, ...); +void reiserfs_debug (struct super_block *s, int level, const char * fmt, ...); +void print_virtual_node (struct virtual_node * vn); +void print_indirect_item (struct buffer_head * bh, int item_num); +void store_print_tb (struct tree_balance * tb); +void print_cur_tb (char * mes); +void print_de (struct reiserfs_dir_entry * de); +void print_bi (struct buffer_info * bi, char * 
mes); +#define PRINT_LEAF_ITEMS 1 /* print all items */ +#define PRINT_DIRECTORY_ITEMS 2 /* print directory items */ +#define PRINT_DIRECT_ITEMS 4 /* print contents of direct items */ +void print_block (struct buffer_head * bh, ...); +void print_path (struct tree_balance * tb, struct path * path); +void print_bmap (struct super_block * s, int silent); +void print_bmap_block (int i, char * data, int size, int silent); +/*void print_super_block (struct super_block * s, char * mes);*/ +void print_objectid_map (struct super_block * s); +void print_block_head (struct buffer_head * bh, char * mes); +void check_leaf (struct buffer_head * bh); +void check_internal (struct buffer_head * bh); +void print_statistics (struct super_block * s); + +/* lbalance.c */ +int leaf_move_items (int shift_mode, struct tree_balance * tb, int mov_num, int mov_bytes, struct buffer_head * Snew); +int leaf_shift_left (struct tree_balance * tb, int shift_num, int shift_bytes); +int leaf_shift_right (struct tree_balance * tb, int shift_num, int shift_bytes); +void leaf_delete_items (struct buffer_info * cur_bi, int last_first, int first, int del_num, int del_bytes); +void leaf_insert_into_buf (struct buffer_info * bi, int before, + struct item_head * inserted_item_ih, const char * inserted_item_body, int zeros_number); +void leaf_paste_in_buffer (struct buffer_info * bi, int pasted_item_num, + int pos_in_item, int paste_size, const char * body, int zeros_number); +void leaf_cut_from_buffer (struct buffer_info * bi, int cut_item_num, int pos_in_item, + int cut_size); +void leaf_paste_entries (struct buffer_head * bh, int item_num, int before, + int new_entry_count, struct reiserfs_de_head * new_dehs, const char * records, int paste_size); +/* ibalance.c */ +int balance_internal (struct tree_balance * , int, int, struct item_head * , + struct buffer_head **); + +/* do_balance.c */ +inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, + struct buffer_head * bh, int flag); +#define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty +#define do_balance_mark_sb_dirty do_balance_mark_leaf_dirty + +void do_balance (struct tree_balance * tb, struct item_head * ih, + const char * body, int flag); +void reiserfs_invalidate_buffer (struct tree_balance * tb, struct buffer_head * bh); + +int get_left_neighbor_position (struct tree_balance * tb, int h); +int get_right_neighbor_position (struct tree_balance * tb, int h); +void replace_key (struct tree_balance * tb, struct buffer_head *, int, struct buffer_head *, int); +void replace_lkey (struct tree_balance *, int, struct item_head *); +void replace_rkey (struct tree_balance *, int, struct item_head *); +void make_empty_node (struct buffer_info *); +struct buffer_head * get_FEB (struct tree_balance *); + +/* bitmap.c */ +int is_reusable (struct super_block * s, unsigned long block, int bit_value); +void reiserfs_free_block (struct reiserfs_transaction_handle *th, unsigned long); +int reiserfs_new_blocknrs (struct reiserfs_transaction_handle *th, + unsigned long * pblocknrs, unsigned long start_from, int amount_needed); +int reiserfs_new_unf_blocknrs (struct reiserfs_transaction_handle *th, + unsigned long * pblocknr, unsigned long start_from); +#ifdef REISERFS_PREALLOCATE +int reiserfs_new_unf_blocknrs2 (struct reiserfs_transaction_handle *th, + struct inode * inode, + unsigned long * pblocknr, + unsigned long start_from); + +void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, + struct inode * inode); +#endif + +/* hashes.c */ +__u32 keyed_hash 
(const char *msg, int len); +__u32 yura_hash (const char *msg, int len); +__u32 r5_hash (const char *msg, int len); + +/* version.c */ +char *reiserfs_get_version_string(void) ; + +/* the ext2 bit routines adjust for big or little endian as +** appropriate for the arch, so in our laziness we use them rather +** than using the bit routines they call more directly. These +** routines must be used when changing on disk bitmaps. */ +#define reiserfs_test_and_set_le_bit ext2_set_bit +#define reiserfs_test_and_clear_le_bit ext2_clear_bit +#define reiserfs_test_le_bit ext2_test_bit +#define reiserfs_find_next_zero_le_bit ext2_find_next_zero_bit + + +// +// this was totally copied from from linux's +// find_first_zero_bit and changed a bit +// + +#ifdef __i386__ + +extern __inline__ int +find_first_nonzero_bit(void * addr, unsigned size) { + int res; + int __d0; + void *__d1; + + + if (!size) { + return (0); + } + __asm__ __volatile__ ( + "cld\n\t" + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "je 1f\n\t" + "movl -4(%%edi),%%eax\n\t" + "subl $4, %%edi\n\t" + "bsfl %%eax,%%eax\n\t" + "1:\tsubl %%edx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), + "=c"(__d0), "=D"(__d1) + :"1" ((size + 31) >> 5), "d" (addr), "2" (addr)); + return (res); +} + +#else /* __i386__ */ + +extern __inline__ int find_next_nonzero_bit(void * addr, unsigned size, unsigned offset) +{ + unsigned int * p = ((unsigned int *) addr) + (offset >> 5); + unsigned int result = offset & ~31UL; + unsigned int tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 31UL; + if (offset) { + tmp = *p++; + /* set to zero first offset bits */ + tmp &= ~(~0UL >> (32-offset)); + if (size < 32) + goto found_first; + if (tmp != 0U) + goto found_middle; + size -= 32; + result += 32; + } + while (size >= 32) { + if ((tmp = *p++) != 0U) + goto found_middle; + result += 32; + size -= 32; + } + if (!size) + return result; + tmp = *p; +found_first: +found_middle: + return result + ffs(tmp); +} + +#define find_first_nonzero_bit(addr,size) find_next_nonzero_bit((addr), (size), 0) + +#endif /* 0 */ + +/* sometimes reiserfs_truncate may require to allocate few new blocks + to perform indirect2direct conversion. People probably used to + think, that truncate should work without problems on a filesystem + without free disk space. They may complain that they can not + truncate due to lack of free disk space. This spare space allows us + to not worry about it. 
500 is probably too much, but it should be + absolutely safe */ +#define SPARE_SPACE 500 + +extern inline unsigned long reiserfs_get_journal_block(struct super_block *s) { + return le32_to_cpu(SB_DISK_SUPER_BLOCK(s)->s_journal_block) ; +} +extern inline unsigned long reiserfs_get_journal_orig_size(struct super_block *s) { + return le32_to_cpu(SB_DISK_SUPER_BLOCK(s)->s_orig_journal_size) ; +} + +/* prototypes from ioctl.c */ +int reiserfs_ioctl (struct inode * inode, struct file * filp, + unsigned int cmd, unsigned long arg); +int reiserfs_unpack (struct inode * inode, struct file * filp); + +/* ioctl's command */ +#define REISERFS_IOC_UNPACK _IOW(0xCD,1,long) + +#endif /* _LINUX_REISER_FS_H */ + + diff -u --recursive --new-file v2.4.0/linux/include/linux/reiserfs_fs_i.h linux/include/linux/reiserfs_fs_i.h --- v2.4.0/linux/include/linux/reiserfs_fs_i.h Wed Dec 31 16:00:00 1969 +++ linux/include/linux/reiserfs_fs_i.h Mon Jan 15 12:42:32 2001 @@ -0,0 +1,63 @@ +#ifndef _REISER_FS_I +#define _REISER_FS_I + +/* these are used to keep track of the pages that need +** flushing before the current transaction can commit +*/ +struct reiserfs_page_list ; + +struct reiserfs_inode_info { + __u32 i_key [4];/* key is still 4 32 bit integers */ + + /* this comment will be totally + cryptic to readers not familiar + with 3.5/3.6 format conversion, and + it does not consider that that 3.6 + might not be the last version */ + int i_version; // this says whether file is old or new + + int i_pack_on_close ; // file might need tail packing on close + + __u32 i_first_direct_byte; // offset of first byte stored in direct item. + + /* pointer to the page that must be flushed before + ** the current transaction can commit. + ** + ** this pointer is only used when the tail is converted back into + ** a direct item, or the file is deleted + */ + struct reiserfs_page_list *i_converted_page ; + + /* we save the id of the transaction when we did the direct->indirect + ** conversion. That allows us to flush the buffers to disk + ** without having to update this inode to zero out the converted + ** page variable + */ + int i_conversion_trans_id ; + + /* My guess is this contains the first + unused block of a sequence of + blocks plus the length of the + sequence, which I think is always + at least two at the time of the + preallocation. I really prefer + allocate on flush conceptually..... + + You know, it really annoys me when + code is this badly commented that I + have to guess what it does. + Neither I nor anyone else has time + for guessing what your + datastructures mean. -Hans */ + //For preallocation + int i_prealloc_block; + int i_prealloc_count; + + /* I regret that you think the below + is a comment you should make.... 
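
/* A userspace sketch of issuing the unpack ioctl declared a few lines up,
   which asks reiserfs to store the file's tail unpacked (compare the nopack
   attribute in reiserfs_inode_info below).  Whether the handler looks at the
   long argument is not visible in this hunk; 1 is passed as a conventional
   placeholder, and error handling is minimal. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define REISERFS_IOC_UNPACK _IOW(0xCD, 1, long)

int main(int argc, char **argv)
{
    int fd;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <file on reiserfs>\n", argv[0]);
        return 1;
    }
    fd = open(argv[1], O_RDWR);
    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (ioctl(fd, REISERFS_IOC_UNPACK, 1L) < 0)
        perror("REISERFS_IOC_UNPACK");
    close(fd);
    return 0;
}
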
-Hans */ + //nopack-attribute + int nopack; +}; + + +#endif diff -u --recursive --new-file v2.4.0/linux/include/linux/reiserfs_fs_sb.h linux/include/linux/reiserfs_fs_sb.h --- v2.4.0/linux/include/linux/reiserfs_fs_sb.h Wed Dec 31 16:00:00 1969 +++ linux/include/linux/reiserfs_fs_sb.h Mon Jan 15 17:25:04 2001 @@ -0,0 +1,398 @@ +/* Copyright 1996-2000 Hans Reiser, see reiserfs/README for licensing + * and copyright details */ + +#ifndef _LINUX_REISER_FS_SB +#define _LINUX_REISER_FS_SB + +#ifdef __KERNEL__ +#include +#endif + +// +// super block's field values +// +/*#define REISERFS_VERSION 0 undistributed bitmap */ +/*#define REISERFS_VERSION 1 distributed bitmap and resizer*/ +#define REISERFS_VERSION_2 2 /* distributed bitmap, resizer, 64-bit, etc*/ +#define UNSET_HASH 0 // read_super will guess about, what hash names + // in directories were sorted with +#define TEA_HASH 1 +#define YURA_HASH 2 +#define R5_HASH 3 +#define DEFAULT_HASH R5_HASH + +/* this is the on disk super block */ + +struct reiserfs_super_block +{ + __u32 s_block_count; + __u32 s_free_blocks; /* free blocks count */ + __u32 s_root_block; /* root block number */ + __u32 s_journal_block; /* journal block number */ + __u32 s_journal_dev; /* journal device number */ + + /* Since journal size is currently a #define in a header file, if + ** someone creates a disk with a 16MB journal and moves it to a + ** system with 32MB journal default, they will overflow their journal + ** when they mount the disk. s_orig_journal_size, plus some checks + ** while mounting (inside journal_init) prevent that from happening + */ + + /* great comment Chris. Thanks. -Hans */ + + __u32 s_orig_journal_size; + __u32 s_journal_trans_max ; /* max number of blocks in a transaction. */ + __u32 s_journal_block_count ; /* total size of the journal. can change over time */ + __u32 s_journal_max_batch ; /* max number of blocks to batch into a trans */ + __u32 s_journal_max_commit_age ; /* in seconds, how old can an async commit be */ + __u32 s_journal_max_trans_age ; /* in seconds, how old can a transaction be */ + __u16 s_blocksize; /* block size */ + __u16 s_oid_maxsize; /* max size of object id array, see get_objectid() commentary */ + __u16 s_oid_cursize; /* current size of object id array */ + __u16 s_state; /* valid or error */ + char s_magic[12]; /* reiserfs magic string indicates that file system is reiserfs */ + __u32 s_hash_function_code; /* indicate, what hash function is being use to sort names in a directory*/ + __u16 s_tree_height; /* height of disk tree */ + __u16 s_bmap_nr; /* amount of bitmap blocks needed to address each block of file system */ + __u16 s_version; /* I'd prefer it if this was a string, + something like "3.6.4", and maybe + 16 bytes long mostly unused. We + don't need to save bytes in the + superblock. -Hans */ + __u16 s_reserved; + char s_unused[128] ; /* zero filled by mkreiserfs */ +} __attribute__ ((__packed__)); + +#define SB_SIZE (sizeof(struct reiserfs_super_block)) + +/* this is the super from 3.5.X, where X >= 10 */ +struct reiserfs_super_block_v1 +{ + __u32 s_block_count; /* blocks count */ + __u32 s_free_blocks; /* free blocks count */ + __u32 s_root_block; /* root block number */ + __u32 s_journal_block; /* journal block number */ + __u32 s_journal_dev; /* journal device number */ + __u32 s_orig_journal_size; /* size of the journal on FS creation. used to make sure they don't overflow it */ + __u32 s_journal_trans_max ; /* max number of blocks in a transaction. 
*/ + __u32 s_journal_block_count ; /* total size of the journal. can change over time */ + __u32 s_journal_max_batch ; /* max number of blocks to batch into a trans */ + __u32 s_journal_max_commit_age ; /* in seconds, how old can an async commit be */ + __u32 s_journal_max_trans_age ; /* in seconds, how old can a transaction be */ + __u16 s_blocksize; /* block size */ + __u16 s_oid_maxsize; /* max size of object id array, see get_objectid() commentary */ + __u16 s_oid_cursize; /* current size of object id array */ + __u16 s_state; /* valid or error */ + char s_magic[16]; /* reiserfs magic string indicates that file system is reiserfs */ + __u16 s_tree_height; /* height of disk tree */ + __u16 s_bmap_nr; /* amount of bitmap blocks needed to address each block of file system */ + __u32 s_reserved; +} __attribute__ ((__packed__)); + +#define SB_SIZE_V1 (sizeof(struct reiserfs_super_block_v1)) + +/* LOGGING -- */ + +/* These all interelate for performance. +** +** If the journal block count is smaller than n transactions, you lose speed. +** I don't know what n is yet, I'm guessing 8-16. +** +** typical transaction size depends on the application, how often fsync is +** called, and how many metadata blocks you dirty in a 30 second period. +** The more small files (<16k) you use, the larger your transactions will +** be. +** +** If your journal fills faster than dirty buffers get flushed to disk, it must flush them before allowing the journal +** to wrap, which slows things down. If you need high speed meta data updates, the journal should be big enough +** to prevent wrapping before dirty meta blocks get to disk. +** +** If the batch max is smaller than the transaction max, you'll waste space at the end of the journal +** because journal_end sets the next transaction to start at 0 if the next transaction has any chance of wrapping. +** +** The large the batch max age, the better the speed, and the more meta data changes you'll lose after a crash. +** +*/ + +/* don't mess with these for a while */ + /* we have a node size define somewhere in reiserfs_fs.h. -Hans */ +#define JOURNAL_BLOCK_SIZE 4096 /* BUG gotta get rid of this */ +#define JOURNAL_MAX_CNODE 1500 /* max cnodes to allocate. */ +#define JOURNAL_TRANS_MAX 1024 /* biggest possible single transaction, don't change for now (8/3/99) */ +#define JOURNAL_HASH_SIZE 8192 +#define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating. Must be >= 2 */ +#define JOURNAL_LIST_COUNT 64 + +/* these are bh_state bit flag offset numbers, for use in the buffer head */ + +#define BH_JDirty 16 /* journal data needs to be written before buffer can be marked dirty */ +#define BH_JDirty_wait 18 /* commit is done, buffer marked dirty */ +#define BH_JNew 19 /* buffer allocated during this transaction, no need to write if freed during this trans too */ + +/* ugly. metadata blocks must be prepared before they can be logged. +** prepared means unlocked and cleaned. If the block is prepared, but not +** logged for some reason, any bits cleared while preparing it must be +** set again. +*/ +#define BH_JPrepared 20 /* block has been prepared for the log */ +#define BH_JRestore_dirty 22 /* restore the dirty bit later */ + +/* One of these for every block in every transaction +** Each one is in two hash tables. First, a hash of the current transaction, and after journal_end, a +** hash of all the in memory transactions. +** next and prev are used by the current transaction (journal_hash). +** hnext and hprev are used by journal_list_hash. 
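
/* A userspace sketch that reads the version 2 super block defined above from
   a file system image and prints a few fields.  The on-disk fields are
   little-endian (the SB_* accessors later in this patch wrap them in
   le32_to_cpu/le16_to_cpu), so le32toh is used here.  The 64 KB offset is an
   assumption about where the newer format keeps its super block; it is not
   part of this hunk, and only the leading fields of the structure are
   declared. */
#include <stdio.h>
#include <stdint.h>
#include <endian.h>

#define SB_OFFSET (64 * 1024)               /* assumed location in the image */

struct demo_reiserfs_sb {                   /* leading fields only */
    uint32_t s_block_count;
    uint32_t s_free_blocks;
    uint32_t s_root_block;
    uint32_t s_journal_block;
    uint32_t s_journal_dev;
    uint32_t s_orig_journal_size;
} __attribute__ ((__packed__));

int main(int argc, char **argv)
{
    struct demo_reiserfs_sb sb;
    FILE *f;

    if (argc != 2) {
        fprintf(stderr, "usage: %s <image>\n", argv[0]);
        return 1;
    }
    f = fopen(argv[1], "rb");
    if (!f || fseek(f, SB_OFFSET, SEEK_SET) != 0 ||
        fread(&sb, sizeof(sb), 1, f) != 1) {
        perror("read super block");
        return 1;
    }
    printf("blocks: %u  free: %u  root block: %u\n",
           le32toh(sb.s_block_count),
           le32toh(sb.s_free_blocks),
           le32toh(sb.s_root_block));
    fclose(f);
    return 0;
}
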
If a block is in more than one transaction, the journal_list_hash +** links it in multiple times. This allows flush_journal_list to remove just the cnode belonging +** to a given transaction. +*/ +struct reiserfs_journal_cnode { + struct buffer_head *bh ; /* real buffer head */ + kdev_t dev ; /* dev of real buffer head */ + unsigned long blocknr ; /* block number of real buffer head, == 0 when buffer on disk */ + int state ; + struct reiserfs_journal_list *jlist ; /* journal list this cnode lives in */ + struct reiserfs_journal_cnode *next ; /* next in transaction list */ + struct reiserfs_journal_cnode *prev ; /* prev in transaction list */ + struct reiserfs_journal_cnode *hprev ; /* prev in hash list */ + struct reiserfs_journal_cnode *hnext ; /* next in hash list */ +}; + +struct reiserfs_bitmap_node { + int id ; + char *data ; + struct list_head list ; +} ; + +struct reiserfs_list_bitmap { + struct reiserfs_journal_list *journal_list ; + struct reiserfs_bitmap_node **bitmaps ; +} ; + +/* +** transaction handle which is passed around for all journal calls +*/ +struct reiserfs_transaction_handle { + /* ifdef it. -Hans */ + char *t_caller ; /* debugging use */ + int t_blocks_logged ; /* number of blocks this writer has logged */ + int t_blocks_allocated ; /* number of blocks this writer allocated */ + unsigned long t_trans_id ; /* sanity check, equals the current trans id */ + struct super_block *t_super ; /* super for this FS when journal_begin was + called. saves calls to reiserfs_get_super */ + +} ; + +/* +** one of these for each transaction. The most important part here is the j_realblock. +** this list of cnodes is used to hash all the blocks in all the commits, to mark all the +** real buffer heads dirty once all the commits hit the disk, +** and to make sure every real block in a transaction is on disk before allowing the log area +** to be overwritten */ +struct reiserfs_journal_list { + unsigned long j_start ; + unsigned long j_len ; + atomic_t j_nonzerolen ; + atomic_t j_commit_left ; + atomic_t j_flushing ; + atomic_t j_commit_flushing ; + atomic_t j_older_commits_done ; /* all commits older than this on disk*/ + unsigned long j_trans_id ; + time_t j_timestamp ; + struct reiserfs_list_bitmap *j_list_bitmap ; + struct buffer_head *j_commit_bh ; /* commit buffer head */ + struct reiserfs_journal_cnode *j_realblock ; + struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans. free each of these on flush */ + wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */ + wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */ +} ; + +struct reiserfs_page_list ; /* defined in reiserfs_fs.h */ + +struct reiserfs_journal { + struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */ + struct reiserfs_journal_cnode *j_last ; /* newest journal block */ + struct reiserfs_journal_cnode *j_first ; /* oldest journal block. start here for traverse */ + + int j_state ; + unsigned long j_trans_id ; + unsigned long j_mount_id ; + unsigned long j_start ; /* start of current waiting commit (index into j_ap_blocks) */ + unsigned long j_len ; /* lenght of current waiting commit */ + unsigned long j_len_alloc ; /* number of buffers requested by journal_begin() */ + atomic_t j_wcount ; /* count of writers for current commit */ + unsigned long j_bcount ; /* batch count. 
allows turning X transactions into 1 */ + unsigned long j_first_unflushed_offset ; /* first unflushed transactions offset */ + unsigned long j_last_flush_trans_id ; /* last fully flushed journal timestamp */ + struct buffer_head *j_header_bh ; + + /* j_flush_pages must be flushed before the current transaction can + ** commit + */ + struct reiserfs_page_list *j_flush_pages ; + time_t j_trans_start_time ; /* time this transaction started */ + wait_queue_head_t j_wait ; /* wait journal_end to finish I/O */ + atomic_t j_wlock ; /* lock for j_wait */ + wait_queue_head_t j_join_wait ; /* wait for current transaction to finish before starting new one */ + atomic_t j_jlock ; /* lock for j_join_wait */ + int j_journal_list_index ; /* journal list number of the current trans */ + int j_list_bitmap_index ; /* number of next list bitmap to use */ + int j_must_wait ; /* no more journal begins allowed. MUST sleep on j_join_wait */ + int j_next_full_flush ; /* next journal_end will flush all journal list */ + int j_next_async_flush ; /* next journal_end will flush all async commits */ + + int j_cnode_used ; /* number of cnodes on the used list */ + int j_cnode_free ; /* number of cnodes on the free list */ + + struct reiserfs_journal_cnode *j_cnode_free_list ; + struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */ + + int j_free_bitmap_nodes ; + int j_used_bitmap_nodes ; + struct list_head j_bitmap_nodes ; + struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ; /* array of bitmaps to record the deleted blocks */ + struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ; /* array of all the journal lists */ + struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for real buffer heads in current trans */ + struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all + the transactions */ +}; + +#define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick. magic string to find desc blocks in the journal */ + + +typedef __u32 (*hashf_t) (const char *, int); + +/* reiserfs union of in-core super block data */ +struct reiserfs_sb_info +{ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + /* both the comment and the choice of + name are unclear for s_rs -Hans */ + struct reiserfs_super_block * s_rs; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_ap_bitmap; /* array of buffers, holding block bitmap */ + struct reiserfs_journal *s_journal ; /* pointer to journal information */ + unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ + + /* Comment? -Hans */ + void (*end_io_handler)(struct buffer_head *, int); + hashf_t s_hash_function; /* pointer to function which is used + to sort names in directory. Set on + mount */ + unsigned long s_mount_opt; /* reiserfs's mount options are set + here (currently - NOTAIL, NOLOG, + REPLAYONLY) */ + + /* Comment? -Hans */ + wait_queue_head_t s_wait; + /* To be obsoleted soon by per buffer seals.. 
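
/* A sketch of the mount-time selection that ends up in the s_hash_function
   pointer above: the super block's s_hash_function_code (TEA_HASH, YURA_HASH,
   R5_HASH) picks one of the hashf_t functions declared in reiserfs_fs.h,
   assuming the usual tea/keyed_hash, rupasov/yura_hash, r5/r5_hash
   correspondence.  The function bodies below are placeholders only; the real
   implementations live in hashes.c. */
#include <stdio.h>
#include <stdint.h>

#define TEA_HASH  1
#define YURA_HASH 2
#define R5_HASH   3

typedef uint32_t (*hashf_t)(const char *, int);

/* placeholder bodies, not the real reiserfs hashes */
static uint32_t keyed_hash(const char *msg, int len) { (void)msg; return (uint32_t)len; }
static uint32_t yura_hash (const char *msg, int len) { (void)msg; return (uint32_t)len + 1; }
static uint32_t r5_hash   (const char *msg, int len) { (void)msg; return (uint32_t)len + 2; }

static hashf_t hash_function_for_code(uint32_t code)
{
    switch (code) {
    case TEA_HASH:  return keyed_hash;
    case YURA_HASH: return yura_hash;
    case R5_HASH:   return r5_hash;
    default:        return NULL;    /* UNSET_HASH: must be guessed at mount */
    }
}

int main(void)
{
    hashf_t h = hash_function_for_code(R5_HASH);

    if (h)
        printf("hash(\"foo\") = %u\n", (unsigned)h("foo", 3));
    return 0;
}
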
-Hans */ + atomic_t s_generation_counter; // increased by one every time the + // tree gets re-balanced + + /* session statistics */ + int s_kmallocs; + int s_disk_reads; + int s_disk_writes; + int s_fix_nodes; + int s_do_balance; + int s_unneeded_left_neighbor; + int s_good_search_by_key_reada; + int s_bmaps; + int s_bmaps_without_search; + int s_direct2indirect; + int s_indirect2direct; +}; + + +#define NOTAIL 0 /* -o notail: no tails will be created in a session */ +#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */ +#define REISERFS_NOLOG 4 /* -o nolog: turn journalling off */ +#define REISERFS_CONVERT 5 /* -o conv: causes conversion of old + format super block to the new + format. If not specified - old + partition will be dealt with in a + manner of 3.5.x */ + +/* -o hash={tea, rupasov, r5, detect} is meant for properly mounting +** reiserfs disks from 3.5.19 or earlier. 99% of the time, this option +** is not required. If the normal autodection code can't determine which +** hash to use (because both hases had the same value for a file) +** use this option to force a specific hash. It won't allow you to override +** the existing hash on the FS, so if you have a tea hash disk, and mount +** with -o hash=rupasov, the mount will fail. +*/ +#define FORCE_TEA_HASH 6 /* try to force tea hash on mount */ +#define FORCE_RUPASOV_HASH 7 /* try to force rupasov hash on mount */ +#define FORCE_R5_HASH 8 /* try to force rupasov hash on mount */ +#define FORCE_HASH_DETECT 9 /* try to detect hash function on mount */ + + +/* used for testing experimental features, makes benchmarking new + features with and without more convenient, should never be used by + users in any code shipped to users (ideally) */ + +#define REISERFS_NO_BORDER 11 +#define REISERFS_NO_UNHASHED_RELOCATION 12 +#define REISERFS_HASHED_RELOCATION 13 +#define REISERFS_TEST4 14 + +#define REISERFS_TEST1 11 +#define REISERFS_TEST2 12 +#define REISERFS_TEST3 13 +#define REISERFS_TEST4 14 + +#define reiserfs_r5_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_R5_HASH)) +#define reiserfs_rupasov_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_RUPASOV_HASH)) +#define reiserfs_tea_hash(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_TEA_HASH)) +#define reiserfs_hash_detect(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << FORCE_HASH_DETECT)) +#define reiserfs_no_border(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_BORDER)) +#define reiserfs_no_unhashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NO_UNHASHED_RELOCATION)) +#define reiserfs_hashed_relocation(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_HASHED_RELOCATION)) +#define reiserfs_test4(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_TEST4)) + +#define dont_have_tails(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << NOTAIL)) +#define replay_only(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REPLAYONLY)) +#define reiserfs_dont_log(s) ((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_NOLOG)) +#define old_format_only(s) ((SB_VERSION(s) != REISERFS_VERSION_2) && !((s)->u.reiserfs_sb.s_mount_opt & (1 << REISERFS_CONVERT))) + + +void reiserfs_file_buffer (struct buffer_head * bh, int list); +int reiserfs_is_super(struct super_block *s) ; +int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ; +int flush_old_commits(struct super_block *s, int) ; +int show_reiserfs_locks(void) ; +int reiserfs_resize(struct super_block *, unsigned long) ; + +#define CARRY_ON 0 +#define SCHEDULE_OCCURRED 
1 + + +#define SB_BUFFER_WITH_SB(s) ((s)->u.reiserfs_sb.s_sbh) +#define SB_JOURNAL(s) ((s)->u.reiserfs_sb.s_journal) +#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list) +#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index) +#define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) +#define SB_AP_BITMAP(s) ((s)->u.reiserfs_sb.s_ap_bitmap) + + +// on-disk super block fields converted to cpu form +#define SB_DISK_SUPER_BLOCK(s) ((s)->u.reiserfs_sb.s_rs) +#define SB_BLOCK_COUNT(s) le32_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_block_count)) +#define SB_FREE_BLOCKS(s) le32_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_free_blocks)) +#define SB_REISERFS_MAGIC(s) (SB_DISK_SUPER_BLOCK(s)->s_magic) +#define SB_ROOT_BLOCK(s) le32_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_root_block)) +#define SB_TREE_HEIGHT(s) le16_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_tree_height)) +#define SB_REISERFS_STATE(s) le16_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_state)) +#define SB_VERSION(s) le16_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_version)) +#define SB_BMAP_NR(s) le16_to_cpu ((SB_DISK_SUPER_BLOCK(s)->s_bmap_nr)) + +#define PUT_SB_BLOCK_COUNT(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_block_count = cpu_to_le32(val); } while (0) +#define PUT_SB_FREE_BLOCKS(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_free_blocks = cpu_to_le32(val); } while (0) +#define PUT_SB_ROOT_BLOCK(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_root_block = cpu_to_le32(val); } while (0) +#define PUT_SB_TREE_HEIGHT(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_tree_height = cpu_to_le16(val); } while (0) +#define PUT_SB_REISERFS_STATE(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_state = cpu_to_le16(val); } while (0) +#define PUT_SB_VERSION(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_version = cpu_to_le16(val); } while (0) +#define PUT_SB_BMAP_NR(s, val) do { SB_DISK_SUPER_BLOCK(s)->s_bmap_nr = cpu_to_le16 (val); } while (0) + +#endif /* _LINUX_REISER_FS_SB */ + + + diff -u --recursive --new-file v2.4.0/linux/include/linux/sched.h linux/include/linux/sched.h --- v2.4.0/linux/include/linux/sched.h Thu Jan 4 14:50:47 2001 +++ linux/include/linux/sched.h Mon Jan 15 17:25:05 2001 @@ -219,13 +219,14 @@ unsigned long rss, total_vm, locked_vm; unsigned long def_flags; unsigned long cpu_vm_mask; - unsigned long swap_cnt; /* number of pages to swap on next pass */ unsigned long swap_address; /* Architecture-specific MM context */ mm_context_t context; }; +extern int mmlist_nr; + #define INIT_MM(name) \ { \ mmap: &init_mmap, \ @@ -542,8 +543,8 @@ #define CURRENT_TIME (xtime.tv_sec) -extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)); -extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode)); +extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); +extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); extern void FASTCALL(sleep_on(wait_queue_head_t *q)); extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, signed long timeout)); @@ -552,12 +553,16 @@ signed long timeout)); extern void FASTCALL(wake_up_process(struct task_struct * tsk)); -#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE) -#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,0) -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE) -#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE) -#define wake_up_interruptible_all(x) 
__wake_up((x),TASK_INTERRUPTIBLE,0) -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE,WQ_FLAG_EXCLUSIVE) +#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) +#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) +#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) +#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); diff -u --recursive --new-file v2.4.0/linux/include/linux/swap.h linux/include/linux/swap.h --- v2.4.0/linux/include/linux/swap.h Thu Jan 4 14:50:46 2001 +++ linux/include/linux/swap.h Mon Jan 15 17:25:04 2001 @@ -107,7 +107,7 @@ extern int page_launder(int, int); extern int free_shortage(void); extern int inactive_shortage(void); -extern void wakeup_kswapd(int); +extern void wakeup_kswapd(void); extern int try_to_free_pages(unsigned int gfp_mask); /* linux/mm/page_io.c */ diff -u --recursive --new-file v2.4.0/linux/ipc/shm.c linux/ipc/shm.c --- v2.4.0/linux/ipc/shm.c Fri Dec 29 14:21:48 2000 +++ linux/ipc/shm.c Sun Jan 14 11:22:21 2001 @@ -121,6 +121,7 @@ { shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; shm_rmid (shp->id); + shmem_lock(shp->shm_file, 0); fput (shp->shm_file); kfree (shp); } @@ -467,10 +468,10 @@ if(err) goto out_unlock; if(cmd==SHM_LOCK) { - shp->shm_file->f_dentry->d_inode->u.shmem_i.locked = 1; + shmem_lock(shp->shm_file, 1); shp->shm_flags |= SHM_LOCKED; } else { - shp->shm_file->f_dentry->d_inode->u.shmem_i.locked = 0; + shmem_lock(shp->shm_file, 0); shp->shm_flags &= ~SHM_LOCKED; } shm_unlock(shmid); diff -u --recursive --new-file v2.4.0/linux/kernel/context.c linux/kernel/context.c --- v2.4.0/linux/kernel/context.c Sun Dec 10 09:53:51 2000 +++ linux/kernel/context.c Fri Jan 12 09:52:41 2001 @@ -148,7 +148,7 @@ int start_context_thread(void) { - kernel_thread(context_thread, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGHAND); + kernel_thread(context_thread, NULL, CLONE_FS | CLONE_FILES); return 0; } diff -u --recursive --new-file v2.4.0/linux/kernel/fork.c linux/kernel/fork.c --- v2.4.0/linux/kernel/fork.c Wed Jan 3 20:45:26 2001 +++ linux/kernel/fork.c Wed Jan 10 14:53:54 2001 @@ -134,7 +134,6 @@ mm->mmap_cache = NULL; mm->map_count = 0; mm->cpu_vm_mask = 0; - mm->swap_cnt = 0; mm->swap_address = 0; pprev = &mm->mmap; for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { @@ -193,6 +192,7 @@ } spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; +int mmlist_nr; #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) @@ -246,6 +246,7 @@ { if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { list_del(&mm->mmlist); + mmlist_nr--; spin_unlock(&mmlist_lock); exit_mmap(mm); mmdrop(mm); @@ -326,6 +327,7 @@ */ spin_lock(&mmlist_lock); list_add(&mm->mmlist, &oldmm->mmlist); + mmlist_nr++; spin_unlock(&mmlist_lock); if (retval) diff -u --recursive --new-file 
v2.4.0/linux/kernel/ksyms.c linux/kernel/ksyms.c --- v2.4.0/linux/kernel/ksyms.c Tue Jan 2 16:45:37 2001 +++ linux/kernel/ksyms.c Mon Jan 15 12:42:32 2001 @@ -159,6 +159,7 @@ EXPORT_SYMBOL(d_lookup); EXPORT_SYMBOL(__d_path); EXPORT_SYMBOL(mark_buffer_dirty); +EXPORT_SYMBOL(set_buffer_async_io); /* for reiserfs_writepage */ EXPORT_SYMBOL(__mark_buffer_dirty); EXPORT_SYMBOL(__mark_inode_dirty); EXPORT_SYMBOL(get_empty_filp); diff -u --recursive --new-file v2.4.0/linux/kernel/sched.c linux/kernel/sched.c --- v2.4.0/linux/kernel/sched.c Thu Jan 4 13:50:38 2001 +++ linux/kernel/sched.c Mon Jan 15 13:08:15 2001 @@ -690,19 +690,15 @@ } static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, - unsigned int wq_mode, const int sync) + int nr_exclusive, const int sync) { struct list_head *tmp, *head; - struct task_struct *p, *best_exclusive; + struct task_struct *p; unsigned long flags; - int best_cpu, irq; if (!q) goto out; - best_cpu = smp_processor_id(); - irq = in_interrupt(); - best_exclusive = NULL; wq_write_lock_irqsave(&q->lock, flags); #if WAITQUEUE_DEBUG @@ -730,47 +726,27 @@ #if WAITQUEUE_DEBUG curr->__waker = (long)__builtin_return_address(0); #endif - /* - * If waking up from an interrupt context then - * prefer processes which are affine to this - * CPU. - */ - if (irq && (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE)) { - if (!best_exclusive) - best_exclusive = p; - if (p->processor == best_cpu) { - best_exclusive = p; - break; - } - } else { - if (sync) - wake_up_process_synchronous(p); - else - wake_up_process(p); - if (curr->flags & wq_mode & WQ_FLAG_EXCLUSIVE) - break; - } + if (sync) + wake_up_process_synchronous(p); + else + wake_up_process(p); + if ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; } } - if (best_exclusive) { - if (sync) - wake_up_process_synchronous(best_exclusive); - else - wake_up_process(best_exclusive); - } wq_write_unlock_irqrestore(&q->lock, flags); out: return; } -void __wake_up(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode) +void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, wq_mode, 0); + __wake_up_common(q, mode, nr, 0); } -void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, unsigned int wq_mode) +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) { - __wake_up_common(q, mode, wq_mode, 1); + __wake_up_common(q, mode, nr, 1); } #define SLEEP_ON_VAR \ diff -u --recursive --new-file v2.4.0/linux/mm/filemap.c linux/mm/filemap.c --- v2.4.0/linux/mm/filemap.c Tue Jan 2 18:59:45 2001 +++ linux/mm/filemap.c Mon Jan 15 17:14:41 2001 @@ -143,7 +143,8 @@ list_add(&page->list, &mapping->dirty_pages); spin_unlock(&pagecache_lock); - mark_inode_dirty_pages(mapping->host); + if (mapping->host) + mark_inode_dirty_pages(mapping->host); } /** @@ -306,7 +307,7 @@ */ age_page_up(page); if (inactive_shortage() > inactive_target / 2 && free_shortage()) - wakeup_kswapd(0); + wakeup_kswapd(); not_found: return page; } @@ -974,10 +975,6 @@ * accessed sequentially. 
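The sched.h macros and the __wake_up_common() rewrite above replace the old WQ_FLAG_EXCLUSIVE argument with an explicit count: nr_exclusive bounds how many exclusive sleepers one call may wake, with 0 meaning no limit. A minimal kernel-context sketch of how the new *_nr variants might be used, assuming the usual 2.4 wait-queue API from <linux/sched.h> and <linux/wait.h>; the request queue, counter and helper names below are illustrative only, and locking is omitted.

static DECLARE_WAIT_QUEUE_HEAD(request_wait);
static int requests_free;	/* illustrative counter, no locking shown */

static void wait_for_free_request(void)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&request_wait, &wait);	/* sets WQ_FLAG_EXCLUSIVE */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (requests_free > 0)
			break;
		schedule();
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(&request_wait, &wait);
	requests_free--;
}

static void release_requests(int nr)
{
	requests_free += nr;
	/*
	 * Wake at most 'nr' exclusive sleepers (plus any non-exclusive
	 * ones); wake_up() is now simply the nr == 1 case and
	 * wake_up_all() the unlimited (nr == 0) case.
	 */
	wake_up_nr(&request_wait, nr);
}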
*/ if (ahead) { - if (reada_ok == 2) { - run_task_queue(&tq_disk); - } - filp->f_ralen += ahead; filp->f_rawin += filp->f_ralen; filp->f_raend = raend + ahead + 1; @@ -1835,7 +1832,8 @@ n->vm_end = end; setup_read_behavior(n, behavior); n->vm_raend = 0; - get_file(n->vm_file); + if (n->vm_file) + get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); lock_vma_mappings(vma); @@ -1861,7 +1859,8 @@ n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT; setup_read_behavior(n, behavior); n->vm_raend = 0; - get_file(n->vm_file); + if (n->vm_file) + get_file(n->vm_file); if (n->vm_ops && n->vm_ops->open) n->vm_ops->open(n); lock_vma_mappings(vma); @@ -1893,7 +1892,8 @@ right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT; left->vm_raend = 0; right->vm_raend = 0; - atomic_add(2, &vma->vm_file->f_count); + if (vma->vm_file) + atomic_add(2, &vma->vm_file->f_count); if (vma->vm_ops && vma->vm_ops->open) { vma->vm_ops->open(left); diff -u --recursive --new-file v2.4.0/linux/mm/memory.c linux/mm/memory.c --- v2.4.0/linux/mm/memory.c Mon Jan 1 10:37:41 2001 +++ linux/mm/memory.c Mon Jan 8 15:39:38 2001 @@ -207,7 +207,8 @@ src_pte = pte_offset(src_pmd, address); dst_pte = pte_offset(dst_pmd, address); - + + spin_lock(&src->page_table_lock); do { pte_t pte = *src_pte; struct page *ptepage; @@ -240,16 +241,21 @@ cont_copy_pte_range: set_pte(dst_pte, pte); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) - goto out; + goto out_unlock; src_pte++; dst_pte++; } while ((unsigned long)src_pte & PTE_TABLE_MASK); + spin_unlock(&src->page_table_lock); cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); } out: + return 0; + +out_unlock: + spin_unlock(&src->page_table_lock); return 0; nomem: diff -u --recursive --new-file v2.4.0/linux/mm/page_alloc.c linux/mm/page_alloc.c --- v2.4.0/linux/mm/page_alloc.c Wed Jan 3 09:59:06 2001 +++ linux/mm/page_alloc.c Mon Jan 15 12:35:12 2001 @@ -16,6 +16,7 @@ #include #include #include +#include int nr_swap_pages; int nr_active_pages; @@ -303,7 +304,7 @@ * an inactive page shortage, wake up kswapd. */ if (inactive_shortage() > inactive_target / 2 && free_shortage()) - wakeup_kswapd(0); + wakeup_kswapd(); /* * If we are about to get low on free pages and cleaning * the inactive_dirty pages would fix the situation, @@ -379,7 +380,7 @@ * - if we don't have __GFP_IO set, kswapd may be * able to free some memory we can't free ourselves */ - wakeup_kswapd(0); + wakeup_kswapd(); if (gfp_mask & __GFP_WAIT) { __set_current_state(TASK_RUNNING); current->policy |= SCHED_YIELD; @@ -404,7 +405,7 @@ * - we're doing a higher-order allocation * --> move pages to the free list until we succeed * - we're /really/ tight on memory - * --> wait on the kswapd waitqueue until memory is freed + * --> try to free pages ourselves with page_launder */ if (!(current->flags & PF_MEMALLOC)) { /* @@ -443,36 +444,20 @@ /* * When we arrive here, we are really tight on memory. * - * We wake up kswapd and sleep until kswapd wakes us - * up again. After that we loop back to the start. - * - * We have to do this because something else might eat - * the memory kswapd frees for us and we need to be - * reliable. Note that we don't loop back for higher - * order allocations since it is possible that kswapd - * simply cannot free a large enough contiguous area - * of memory *ever*. + * We try to free pages ourselves by: + * - shrinking the i/d caches. + * - reclaiming unused memory from the slab caches. 
+ * - swapping/syncing pages to disk (done by page_launder) + * - moving clean pages from the inactive dirty list to + * the inactive clean list. (done by page_launder) */ - if ((gfp_mask & (__GFP_WAIT|__GFP_IO)) == (__GFP_WAIT|__GFP_IO)) { - wakeup_kswapd(1); + if (gfp_mask & __GFP_WAIT) { memory_pressure++; - if (!order) - goto try_again; - /* - * If __GFP_IO isn't set, we can't wait on kswapd because - * kswapd just might need some IO locks /we/ are holding ... - * - * SUBTLE: The scheduling point above makes sure that - * kswapd does get the chance to free memory we can't - * free ourselves... - */ - } else if (gfp_mask & __GFP_WAIT) { try_to_free_pages(gfp_mask); - memory_pressure++; + wakeup_bdflush(0); if (!order) goto try_again; } - } /* @@ -554,14 +539,8 @@ void free_pages(unsigned long addr, unsigned long order) { - struct page *fpage; - -#ifdef CONFIG_DISCONTIGMEM - if (addr == 0) return; -#endif - fpage = virt_to_page(addr); - if (VALID_PAGE(fpage)) - __free_pages(fpage, order); + if (addr != 0) + __free_pages(virt_to_page(addr), order); } /* diff -u --recursive --new-file v2.4.0/linux/mm/shmem.c linux/mm/shmem.c --- v2.4.0/linux/mm/shmem.c Fri Dec 29 14:21:48 2000 +++ linux/mm/shmem.c Sun Jan 14 11:22:21 2001 @@ -310,6 +310,8 @@ } /* We have the page */ SetPageUptodate (page); + if (info->locked) + page_cache_get(page); cached_page: UnlockPage (page); @@ -374,8 +376,7 @@ inode->i_fop = &shmem_dir_operations; break; case S_IFLNK: - inode->i_op = &page_symlink_inode_operations; - break; + BUG(); } spin_lock (&shmem_ilock); list_add (&inode->u.shmem_i.list, &shmem_inodes); @@ -401,6 +402,32 @@ return 0; } +void shmem_lock(struct file * file, int lock) +{ + struct inode * inode = file->f_dentry->d_inode; + struct shmem_inode_info * info = &inode->u.shmem_i; + struct page * page; + unsigned long idx, size; + + if (info->locked == lock) + return; + down(&inode->i_sem); + info->locked = lock; + size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + for (idx = 0; idx < size; idx++) { + page = find_lock_page(inode->i_mapping, idx); + if (!page) + continue; + if (!lock) { + /* release the extra count and our reference */ + page_cache_release(page); + page_cache_release(page); + } + UnlockPage(page); + } + up(&inode->i_sem); +} + /* * Lookup the data. This is trivial - if the dentry didn't already * exist, we know it is negative. @@ -528,19 +555,6 @@ return error; } -static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * symname) -{ - int error; - - error = shmem_mknod(dir, dentry, S_IFLNK | S_IRWXUGO, 0); - if (!error) { - int l = strlen(symname)+1; - struct inode *inode = dentry->d_inode; - error = block_symlink(inode, symname, l); - } - return error; -} - static int shmem_mmap(struct file * file, struct vm_area_struct * vma) { struct vm_operations_struct * ops; @@ -677,7 +691,6 @@ lookup: shmem_lookup, link: shmem_link, unlink: shmem_unlink, - symlink: shmem_symlink, mkdir: shmem_mkdir, rmdir: shmem_rmdir, mknod: shmem_mknod, diff -u --recursive --new-file v2.4.0/linux/mm/slab.c linux/mm/slab.c --- v2.4.0/linux/mm/slab.c Sun Oct 1 19:55:17 2000 +++ linux/mm/slab.c Wed Jan 10 14:24:32 2001 @@ -1702,7 +1702,7 @@ * kmem_cache_reap - Reclaim memory from caches. * @gfp_mask: the type of memory required. * - * Called from try_to_free_page(). 
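The new shmem_lock() above pins (SHM_LOCK) or unpins (SHM_UNLOCK) every page of a SysV shared memory segment by taking or dropping an extra page cache reference, and the ipc/shm.c hunk earlier now routes shmctl() through it. From user space the same path is exercised roughly as follows; a minimal sketch, which needs CAP_IPC_LOCK and omits most error handling.

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	/* create a 64 KB private segment, then ask the kernel to pin it */
	int id = shmget(IPC_PRIVATE, 64 * 1024, IPC_CREAT | 0600);

	if (id < 0 || shmctl(id, SHM_LOCK, NULL) < 0) {	/* needs CAP_IPC_LOCK */
		perror("shm lock");
		return 1;
	}
	shmctl(id, SHM_UNLOCK, NULL);	/* drops the extra page references again */
	shmctl(id, IPC_RMID, NULL);
	return 0;
}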
+ * Called from do_try_to_free_pages() and __alloc_pages() */ void kmem_cache_reap (int gfp_mask) { diff -u --recursive --new-file v2.4.0/linux/mm/vmalloc.c linux/mm/vmalloc.c --- v2.4.0/linux/mm/vmalloc.c Tue Nov 28 22:43:39 2000 +++ linux/mm/vmalloc.c Mon Jan 15 16:54:20 2001 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff -u --recursive --new-file v2.4.0/linux/mm/vmscan.c linux/mm/vmscan.c --- v2.4.0/linux/mm/vmscan.c Wed Jan 3 20:45:26 2001 +++ linux/mm/vmscan.c Mon Jan 15 12:36:49 2001 @@ -35,45 +35,21 @@ * using a process that no longer actually exists (it might * have died while we slept). */ -static int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, int gfp_mask) +static void try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page) { pte_t pte; swp_entry_t entry; - struct page * page; - int onlist; - - pte = *page_table; - if (!pte_present(pte)) - goto out_failed; - page = pte_page(pte); - if ((!VALID_PAGE(page)) || PageReserved(page)) - goto out_failed; - - if (!mm->swap_cnt) - return 1; - - mm->swap_cnt--; - onlist = PageActive(page); /* Don't look at this pte if it's been accessed recently. */ if (ptep_test_and_clear_young(page_table)) { - age_page_up(page); - goto out_failed; + page->age += PAGE_AGE_ADV; + if (page->age > PAGE_AGE_MAX) + page->age = PAGE_AGE_MAX; + return; } - if (!onlist) - /* The page is still mapped, so it can't be freeable... */ - age_page_down_ageonly(page); - - /* - * If the page is in active use by us, or if the page - * is in active use by others, don't unmap it or - * (worse) start unneeded IO. - */ - if (page->age > 0) - goto out_failed; if (TryLockPage(page)) - goto out_failed; + return; /* From this point on, the odds are that we're going to * nuke this pte, so read and clear the pte. This hook @@ -87,9 +63,6 @@ * Is the page already in the swap cache? If so, then * we can just drop our reference to it without doing * any IO - it's already up-to-date on disk. - * - * Return 0, as we didn't actually free any real - * memory, and we should just continue our scan. */ if (PageSwapCache(page)) { entry.val = page->index; @@ -99,12 +72,12 @@ swap_duplicate(entry); set_pte(page_table, swp_entry_to_pte(entry)); drop_pte: - UnlockPage(page); mm->rss--; - deactivate_page(page); + if (!page->age) + deactivate_page(page); + UnlockPage(page); page_cache_release(page); -out_failed: - return 0; + return; } /* @@ -153,34 +126,20 @@ out_unlock_restore: set_pte(page_table, pte); UnlockPage(page); - return 0; + return; } -/* - * A new implementation of swap_out(). We do not swap complete processes, - * but only a small number of blocks, before we continue with the next - * process. The number of blocks actually swapped is determined on the - * number of page faults, that this process actually had in the last time, - * so we won't swap heavily used processes all the time ... - * - * Note: the priority argument is a hint on much CPU to waste with the - * swap block search, not a hint, of how much blocks to swap with - * each process. 
- * - * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de - */ - -static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count) { pte_t * pte; unsigned long pmd_end; if (pmd_none(*dir)) - return 0; + return count; if (pmd_bad(*dir)) { pmd_ERROR(*dir); pmd_clear(dir); - return 0; + return count; } pte = pte_offset(dir, address); @@ -190,28 +149,33 @@ end = pmd_end; do { - int result; - mm->swap_address = address + PAGE_SIZE; - result = try_to_swap_out(mm, vma, address, pte, gfp_mask); - if (result) - return result; + if (pte_present(*pte)) { + struct page *page = pte_page(*pte); + + if (VALID_PAGE(page) && !PageReserved(page)) { + try_to_swap_out(mm, vma, address, pte, page); + if (!--count) + break; + } + } address += PAGE_SIZE; pte++; } while (address && (address < end)); - return 0; + mm->swap_address = address + PAGE_SIZE; + return count; } -static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int gfp_mask) +static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count) { pmd_t * pmd; unsigned long pgd_end; if (pgd_none(*dir)) - return 0; + return count; if (pgd_bad(*dir)) { pgd_ERROR(*dir); pgd_clear(dir); - return 0; + return count; } pmd = pmd_offset(dir, address); @@ -221,23 +185,23 @@ end = pgd_end; do { - int result = swap_out_pmd(mm, vma, pmd, address, end, gfp_mask); - if (result) - return result; + count = swap_out_pmd(mm, vma, pmd, address, end, count); + if (!count) + break; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); - return 0; + return count; } -static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int gfp_mask) +static int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count) { pgd_t *pgdir; unsigned long end; /* Don't swap out areas which are locked down */ if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) - return 0; + return count; pgdir = pgd_offset(mm, address); @@ -245,18 +209,17 @@ if (address >= end) BUG(); do { - int result = swap_out_pgd(mm, vma, pgdir, address, end, gfp_mask); - if (result) - return result; + count = swap_out_pgd(mm, vma, pgdir, address, end, count); + if (!count) + break; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); - return 0; + return count; } -static int swap_out_mm(struct mm_struct * mm, int gfp_mask) +static int swap_out_mm(struct mm_struct * mm, int count) { - int result = 0; unsigned long address; struct vm_area_struct* vma; @@ -276,8 +239,8 @@ address = vma->vm_start; for (;;) { - result = swap_out_vma(mm, vma, address, gfp_mask); - if (result) + count = swap_out_vma(mm, vma, address, count); + if (!count) goto out_unlock; vma = vma->vm_next; if (!vma) @@ -287,94 +250,63 @@ } /* Reset to 0 when we reach the end of address space */ mm->swap_address = 0; - mm->swap_cnt = 0; out_unlock: spin_unlock(&mm->page_table_lock); - return result; + return !count; } /* - * Select the task with maximal swap_cnt and try to swap out a page. * N.B. This function returns only 0 or 1. Return values != 1 from * the lower level routines result in continued processing. 
*/ #define SWAP_SHIFT 5 #define SWAP_MIN 8 +static inline int swap_amount(struct mm_struct *mm) +{ + int nr = mm->rss >> SWAP_SHIFT; + return nr < SWAP_MIN ? SWAP_MIN : nr; +} + static int swap_out(unsigned int priority, int gfp_mask) { int counter; - int __ret = 0; + int retval = 0; + struct mm_struct *mm = current->mm; - /* - * We make one or two passes through the task list, indexed by - * assign = {0, 1}: - * Pass 1: select the swappable task with maximal RSS that has - * not yet been swapped out. - * Pass 2: re-assign rss swap_cnt values, then select as above. - * - * With this approach, there's no need to remember the last task - * swapped out. If the swap-out fails, we clear swap_cnt so the - * task won't be selected again until all others have been tried. - * - * Think of swap_cnt as a "shadow rss" - it tells us which process - * we want to page out (always try largest first). - */ - counter = (nr_threads << SWAP_SHIFT) >> priority; - if (counter < 1) - counter = 1; + /* Always start by trying to penalize the process that is allocating memory */ + if (mm) + retval = swap_out_mm(mm, swap_amount(mm)); - for (; counter >= 0; counter--) { + /* Then, look at the other mm's */ + counter = mmlist_nr >> priority; + do { struct list_head *p; - unsigned long max_cnt = 0; - struct mm_struct *best = NULL; - int assign = 0; - int found_task = 0; - select: + spin_lock(&mmlist_lock); p = init_mm.mmlist.next; - for (; p != &init_mm.mmlist; p = p->next) { - struct mm_struct *mm = list_entry(p, struct mm_struct, mmlist); - if (mm->rss <= 0) - continue; - found_task++; - /* Refresh swap_cnt? */ - if (assign == 1) { - mm->swap_cnt = (mm->rss >> SWAP_SHIFT); - if (mm->swap_cnt < SWAP_MIN) - mm->swap_cnt = SWAP_MIN; - } - if (mm->swap_cnt > max_cnt) { - max_cnt = mm->swap_cnt; - best = mm; - } - } + if (p == &init_mm.mmlist) + goto empty; + + /* Move it to the back of the queue.. */ + list_del(p); + list_add_tail(p, &init_mm.mmlist); + mm = list_entry(p, struct mm_struct, mmlist); - /* Make sure it doesn't disappear */ - if (best) - atomic_inc(&best->mm_users); + /* Make sure the mm doesn't disappear when we drop the lock.. */ + atomic_inc(&mm->mm_users); spin_unlock(&mmlist_lock); - /* - * We have dropped the tasklist_lock, but we - * know that "mm" still exists: we are running - * with the big kernel lock, and exit_mm() - * cannot race with us. - */ - if (!best) { - if (!assign && found_task > 0) { - assign = 1; - goto select; - } - break; - } else { - __ret = swap_out_mm(best, gfp_mask); - mmput(best); - break; - } - } - return __ret; + /* Walk about 6% of the address space each time */ + retval |= swap_out_mm(mm, swap_amount(mm)); + mmput(mm); + } while (--counter >= 0); + return retval; + +empty: + spin_unlock(&mmlist_lock); + return 0; } @@ -540,7 +472,6 @@ */ if (PageDirty(page)) { int (*writepage)(struct page *) = page->mapping->a_ops->writepage; - int result; if (!writepage) goto page_active; @@ -558,16 +489,12 @@ page_cache_get(page); spin_unlock(&pagemap_lru_lock); - result = writepage(page); + writepage(page); page_cache_release(page); /* And re-start the thing.. */ spin_lock(&pagemap_lru_lock); - if (result != 1) - continue; - /* writepage refused to do anything */ - set_page_dirty(page); - goto page_active; + continue; } /* @@ -808,6 +735,9 @@ int inactive_shortage(void) { int shortage = 0; + pg_data_t *pgdat = pgdat_list; + + /* Is the inactive dirty list too small? 
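swap_amount() above decides how many present ptes a single pass over one mm will try to unmap: mm->rss >> SWAP_SHIFT, but never fewer than SWAP_MIN. A small stand-alone illustration of just that arithmetic, with rss passed in directly and arbitrary example figures:

#include <stdio.h>

#define SWAP_SHIFT 5
#define SWAP_MIN   8

/* same clamp as swap_amount(), rss given as a plain page count */
static int swap_amount(unsigned long rss)
{
	int nr = rss >> SWAP_SHIFT;
	return nr < SWAP_MIN ? SWAP_MIN : nr;
}

int main(void)
{
	printf("%d\n", swap_amount(16384));	/* large mm: 16384 >> 5 = 512 */
	printf("%d\n", swap_amount(100));	/* small mm: clamped up to 8  */
	return 0;
}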
*/ shortage += freepages.high; shortage += inactive_target; @@ -818,7 +748,27 @@ if (shortage > 0) return shortage; - return 0; + /* If not, do we have enough per-zone pages on the inactive list? */ + + shortage = 0; + + do { + int i; + for(i = 0; i < MAX_NR_ZONES; i++) { + int zone_shortage; + zone_t *zone = pgdat->node_zones+ i; + + zone_shortage = zone->pages_high; + zone_shortage -= zone->inactive_dirty_pages; + zone_shortage -= zone->inactive_clean_pages; + zone_shortage -= zone->free_pages; + if (zone_shortage > 0) + shortage += zone_shortage; + } + pgdat = pgdat->node_next; + } while (pgdat); + + return shortage; } /* @@ -833,72 +783,35 @@ * really care about latency. In that case we don't try * to free too many pages. */ +#define DEF_PRIORITY (6) static int refill_inactive(unsigned int gfp_mask, int user) { - int priority, count, start_count, made_progress; + int count, start_count, maxtry; count = inactive_shortage() + free_shortage(); if (user) count = (1 << page_cluster); start_count = count; - /* Always trim SLAB caches when memory gets low. */ - kmem_cache_reap(gfp_mask); - - priority = 6; + maxtry = 6; do { - made_progress = 0; - if (current->need_resched) { __set_current_state(TASK_RUNNING); schedule(); } - while (refill_inactive_scan(priority, 1)) { - made_progress = 1; + while (refill_inactive_scan(DEF_PRIORITY, 1)) { if (--count <= 0) goto done; } - /* - * don't be too light against the d/i cache since - * refill_inactive() almost never fail when there's - * really plenty of memory free. - */ - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(priority, gfp_mask); + /* If refill_inactive_scan failed, try to page stuff out.. */ + swap_out(DEF_PRIORITY, gfp_mask); - /* - * Then, try to page stuff out.. - */ - while (swap_out(priority, gfp_mask)) { - made_progress = 1; - if (--count <= 0) - goto done; - } - - /* - * If we either have enough free memory, or if - * page_launder() will be able to make enough - * free memory, then stop. - */ - if (!inactive_shortage() || !free_shortage()) - goto done; - - /* - * Only switch to a lower "priority" if we - * didn't make any useful progress in the - * last loop. - */ - if (!made_progress) - priority--; - } while (priority >= 0); - - /* Always end on a refill_inactive.., may sleep... */ - while (refill_inactive_scan(0, 1)) { - if (--count <= 0) - goto done; - } + if (--maxtry <= 0) + return 0; + + } while (inactive_shortage()); done: return (count < start_count); @@ -922,20 +835,29 @@ /* * If needed, we move pages from the active list - * to the inactive list. We also "eat" pages from - * the inode and dentry cache whenever we do this. + * to the inactive list. */ - if (free_shortage() || inactive_shortage()) { - shrink_dcache_memory(6, gfp_mask); - shrink_icache_memory(6, gfp_mask); + if (inactive_shortage()) ret += refill_inactive(gfp_mask, user); + + /* + * Delete pages from the inode and dentry caches and + * reclaim unused slab cache if memory is low. + */ + if (free_shortage()) { + shrink_dcache_memory(DEF_PRIORITY, gfp_mask); + shrink_icache_memory(DEF_PRIORITY, gfp_mask); } else { /* - * Reclaim unused slab cache memory. + * Illogical, but true. At least for now. + * + * If we're _not_ under shortage any more, we + * reap the caches. Why? Because a noticeable + * part of the caches are the buffer-heads, + * which we'll want to keep if under shortage. */ kmem_cache_reap(gfp_mask); - ret = 1; - } + } return ret; } @@ -988,13 +910,8 @@ static int recalc = 0; /* If needed, try to free some memory. 
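The per-zone part of inactive_shortage() above counts a zone as short whenever its free plus inactive pages no longer cover pages_high. A hypothetical helper making that arithmetic explicit; the figures in the trailing comment are invented for illustration and nothing here is code from the patch.

/* Mirrors the per-zone test above. */
static int zone_inactive_shortage(int pages_high, int inactive_dirty,
				  int inactive_clean, int free_pages)
{
	int shortage = pages_high;

	shortage -= inactive_dirty;
	shortage -= inactive_clean;
	shortage -= free_pages;
	return shortage > 0 ? shortage : 0;
}
/* e.g. pages_high 256, 100 dirty, 60 clean, 40 free -> contributes 56 */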
*/ - if (inactive_shortage() || free_shortage()) { - int wait = 0; - /* Do we need to do some synchronous flushing? */ - if (waitqueue_active(&kswapd_done)) - wait = 1; - do_try_to_free_pages(GFP_KSWAPD, wait); - } + if (inactive_shortage() || free_shortage()) + do_try_to_free_pages(GFP_KSWAPD, 0); /* * Do some (very minimal) background scanning. This @@ -1002,7 +919,7 @@ * every minute. This clears old referenced bits * and moves unused pages to the inactive list. */ - refill_inactive_scan(6, 0); + refill_inactive_scan(DEF_PRIORITY, 0); /* Once a second, recalculate some VM stats. */ if (time_after(jiffies, recalc + HZ)) { @@ -1010,11 +927,6 @@ recalculate_vm_stats(); } - /* - * Wake up everybody waiting for free memory - * and unplug the disk queue. - */ - wake_up_all(&kswapd_done); run_task_queue(&tq_disk); /* @@ -1045,33 +957,10 @@ } } -void wakeup_kswapd(int block) +void wakeup_kswapd(void) { - DECLARE_WAITQUEUE(wait, current); - - if (current == kswapd_task) - return; - - if (!block) { - if (waitqueue_active(&kswapd_wait)) - wake_up(&kswapd_wait); - return; - } - - /* - * Kswapd could wake us up before we get a chance - * to sleep, so we have to be very careful here to - * prevent SMP races... - */ - __set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&kswapd_done, &wait); - - if (waitqueue_active(&kswapd_wait)) - wake_up(&kswapd_wait); - schedule(); - - remove_wait_queue(&kswapd_done, &wait); - __set_current_state(TASK_RUNNING); + if (current != kswapd_task) + wake_up_process(kswapd_task); } /* @@ -1096,7 +985,7 @@ /* * Kreclaimd will move pages from the inactive_clean list to the * free list, in order to keep atomic allocations possible under - * all circumstances. Even when kswapd is blocked on IO. + * all circumstances. */ int kreclaimd(void *unused) { diff -u --recursive --new-file v2.4.0/linux/net/ipv4/igmp.c linux/net/ipv4/igmp.c --- v2.4.0/linux/net/ipv4/igmp.c Thu Sep 7 08:32:01 2000 +++ linux/net/ipv4/igmp.c Tue Jan 9 10:54:57 2001 @@ -504,8 +504,8 @@ im->timer.function=&igmp_timer_expire; im->unsolicit_count = IGMP_Unsolicited_Report_Count; im->reporter = 0; - im->loaded = 0; #endif + im->loaded = 0; write_lock_bh(&in_dev->lock); im->next=in_dev->mc_list; in_dev->mc_list=im; diff -u --recursive --new-file v2.4.0/linux/net/ipv4/tcp.c linux/net/ipv4/tcp.c --- v2.4.0/linux/net/ipv4/tcp.c Tue Nov 28 21:53:45 2000 +++ linux/net/ipv4/tcp.c Wed Jan 10 14:12:12 2001 @@ -954,7 +954,7 @@ */ skb = sk->write_queue.prev; if (tp->send_head && - (mss_now - skb->len) > 0) { + (mss_now > skb->len)) { copy = skb->len; if (skb_tailroom(skb) > 0) { int last_byte_was_odd = (copy % 4); diff -u --recursive --new-file v2.4.0/linux/net/ipv4/tcp_input.c linux/net/ipv4/tcp_input.c --- v2.4.0/linux/net/ipv4/tcp_input.c Fri Dec 29 14:07:24 2000 +++ linux/net/ipv4/tcp_input.c Wed Jan 10 14:12:12 2001 @@ -1705,7 +1705,7 @@ if ((__s32)when < (__s32)tp->rttvar) when = tp->rttvar; - tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, when); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, min(when, TCP_RTO_MAX)); } } diff -u --recursive --new-file v2.4.0/linux/net/sunrpc/sunrpc_syms.c linux/net/sunrpc/sunrpc_syms.c --- v2.4.0/linux/net/sunrpc/sunrpc_syms.c Fri Apr 21 16:08:52 2000 +++ linux/net/sunrpc/sunrpc_syms.c Thu Jan 11 15:53:02 2001 @@ -36,6 +36,7 @@ EXPORT_SYMBOL(rpciod_up); EXPORT_SYMBOL(rpc_new_task); EXPORT_SYMBOL(rpc_wake_up_status); +EXPORT_SYMBOL(rpc_release_task); /* RPC client functions */ EXPORT_SYMBOL(rpc_create_client); diff -u --recursive --new-file v2.4.0/linux/scripts/checkconfig.pl 
linux/scripts/checkconfig.pl --- v2.4.0/linux/scripts/checkconfig.pl Tue Aug 31 09:33:09 1999 +++ linux/scripts/checkconfig.pl Mon Jan 15 15:31:19 2001 @@ -14,6 +14,7 @@ # Initialize variables. my $fInComment = 0; + my $fInString = 0; my $fUseConfig = 0; my $iLinuxConfig = 0; my %configList = (); @@ -23,6 +24,10 @@ # Strip comments. $fInComment && (s+^.*?\*/+ +o ? ($fInComment = 0) : next); m+/\*+o && (s+/\*.*?\*/+ +go, (s+/\*.*$+ +o && ($fInComment = 1))); + + # Strip strings. + $fInString && (s+^.*?"+ +o ? ($fInString = 0) : next); + m+"+o && (s+".*?"+ +go, (s+".*$+ +o && ($fInString = 1))); # Pick up definitions. if ( m/^\s*#/o )